1.什么是K近邻(KNN)?
K近邻(K-nearest neighbors,KNN)是一种基本的机器学习算法。所谓K近邻,就是K个最近的邻居的意思,即每个样本都可以用离它最近的K个邻居来代表。比如:判断一个人的人品,只需要观察与他来往最密切的几个人的人品好坏就可以得出,即"近朱者赤,近墨者黑"。KNN算法既可以应用于分类问题中,也可以应用在回归问题中。
2.算法实现(训练集)
我这边是以4特征的鸢尾花作为数据集,需要数据集可以私聊
计算误差(欧氏距离)公式:$d(x, y) = \sqrt{\sum_{i=1}^{4}(x_i - y_i)^2}$
1.导入数据集
def init(adder):
    """Load the whitespace-separated dataset at path *adder* into a 2-D string array.

    Each line of the file becomes one row; columns are split on single spaces.
    Returns a numpy array of strings (4 feature columns + 1 label column per row).
    """
    f = open(adder)  # open the data file (NOTE(review): handle is never closed)
    lines = f.readlines()  # read the whole file into a list of lines
    data_test = []
    for line in lines:  # walk the lines one by one
        data_test.append(line.rstrip('\n').split(' '))  # strip the newline, split on spaces
    data_temp = np.array(data_test)  # convert the nested list into a matrix
    return data_temp
2.计算输入特征点和每个数据的距离
def argue(need_judge, data_temp):
    """Compute the distance from *need_judge* to every sample in *data_temp*.

    Returns an (n+1, 2) string matrix: row 0 is a dummy all-zeros seed row
    (stripped later by find_suitable); every other row is (distance, label).
    """
    times = data_temp.shape[0]  # number of samples in the dataset
    distance_all = np.zeros((1, 2))  # 2-D seed row so np.r_ concatenation works
    for time in range(times):
        temp = data_temp[time]  # current sample row
        temp_temp = temp[0:4]  # first four columns are the features
        distance_temp = distance_out(need_judge, temp_temp)
        new_row = np.array((distance_temp, temp[4]), dtype=str)  # (distance, label) pair
        new_row = np.array(new_row).reshape((1, 2))  # must stay 2-D or np.r_ cannot stack it
        distance_all = np.r_[distance_all, new_row]  # append the row to the matrix
    return distance_all  # matrix of distances and labels
def distance_out(need_judge, source):  # compute the distance between two samples
    """Return the Euclidean distance between *need_judge* and the 4 features of *source*."""
    source = source.astype(float)  # samples are read as strings; convert before arithmetic
    x = need_judge[0] - source[0]
    y = need_judge[1] - source[1]
    z = need_judge[2] - source[2]
    r = need_judge[3] - source[3]
    distance = math.sqrt((x ** 2 + y ** 2 + z ** 2 + r ** 2))
    return distance
3.从处理完的矩阵中获取最小K个距离的标签
def find_suitable(distance_all, K):
    """Return the most common label among the K smallest-distance rows of *distance_all*."""
    new_distance_all = distance_all[1:]  # drop the dummy seed row added by argue()
    # NOTE(review): column 0 has dtype str, so this argsort orders the distances
    # lexicographically (e.g. "10.0" sorts before "2.0") -- convert to float first.
    sorted_indices = np.argsort(new_distance_all[:, 0])
    min_k_rows = new_distance_all[sorted_indices[:K], :]
    # print(min_k_rows)
    min_name = min_k_rows[:, 1]  # label column of the K nearest rows
    # convert the array to a Python list and count occurrences with Counter
    counts = Counter(min_name.tolist())
    # pick the label that occurs most often
    most_common_str = max(counts, key=counts.get)
    # print("出现最多的字符串是:", most_common_str)
    return most_common_str
以上训练集的部分就完成了,下面通过验证集来检验代码的效果。
3.算法实现(验证集)
def train(need_judge,K):
    """Classify the feature vector *need_judge* with KNN and return the predicted label."""
    data = init(train_address)  # load the training set
    distance_all = argue(need_judge, data)  # distance from the query to every sample
    kinds = find_suitable(distance_all, K)  # majority label of the K nearest samples
    return kinds  # the predicted label
def ua(K):
    """Run the classifier over the whole validation set and print its accuracy (percent)."""
    data = init(prove_address)  # read the validation set from its file path
    data_number = data.shape[0]  # NOTE(review): unused; the division below re-reads data.shape[0]
    right = 0
    faid = 0  # NOTE(review): presumably "failed"; incremented but never read afterwards
    # print(data)
    count = 0
    for temp in data:  # walk every sample group of the validation set
        count += 1
        print("正在进行第", count, "次验证")
        x, y, z, r = get_point(temp)
        # print(get_point(temp))
        need_judge = np.array([x, y, z, r], dtype=float)
        judge = train(need_judge,K)
        if (judge == temp[4]):  # does the predicted label match the validation label?
            right += 1
        else:
            faid += 1
    right_faid = (float)((right) / (data.shape[0]))  # fraction of correct predictions
    print("当前模型的成功率是百分之", right_faid * 100)
4.过程中遇到的问题
1.合并矩阵的时候浪费了我很多时间
distance_temp = distance_out(need_judge, temp_temp)
new_row = np.array((distance_temp, temp[4]), dtype=str)
new_row = np.array(new_row).reshape((1, 2))
distance_all = np.r_[distance_all, new_row]#合并矩阵
上面展示的这段代码,需要注意的是 distance_all 一定要和 new_row 变成同维度的矩阵。在获取 new_row 的过程中,很容易把它变成一维数组,导致矩阵无法扩容。
5.完整代码
# coding=UTF-8
import math
from collections import Counter
import numpy as np
train_address = "./iris.txt"
prove_address = "./test.txt"
def init(adder):
    """Read the whitespace-separated dataset at path *adder*.

    Each line becomes one row of strings (4 feature columns + 1 label column).

    Args:
        adder: path to the data file.

    Returns:
        A 2-D numpy array of strings, one row per line of the file.
    """
    # Context manager guarantees the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(adder) as f:
        rows = [line.rstrip('\n').split(' ') for line in f]
    return np.array(rows)
def argue(need_judge, data_temp):
    """Compute the distance from *need_judge* to every sample in *data_temp*.

    Args:
        need_judge: numeric feature vector of the query point (length 4).
        data_temp: string array from init(); columns 0-3 are features, column 4 the label.

    Returns:
        An (n+1, 2) string array. Row 0 is a dummy "0.0" row -- kept for
        backward compatibility, since find_suitable() strips distance_all[1:].
        Every other row is (distance, label).
    """
    # Build all rows first and convert once: the original concatenated with
    # np.r_ inside the loop, which copies the whole matrix every iteration (O(n^2)).
    rows = [["0.0", "0.0"]]  # dummy seed row, mirrors the original np.zeros((1, 2))
    for sample in data_temp:
        dist = distance_out(need_judge, sample[0:4])  # first four columns are features
        rows.append([str(dist), sample[4]])
    return np.array(rows, dtype=str)
def distance_out(need_judge, source):
    """Return the Euclidean distance between two feature vectors.

    Generalized to any number of features (the original hard-coded exactly 4),
    while remaining backward compatible for 4-feature inputs.

    Args:
        need_judge: numeric feature vector (e.g. float numpy array).
        source: feature vector read as strings (numpy array), converted here.

    Returns:
        The Euclidean distance as a Python float.
    """
    # Samples come from the text file as strings; convert before arithmetic.
    diff = np.asarray(need_judge, dtype=float) - source.astype(float)
    return math.sqrt(float(np.dot(diff, diff)))
def find_suitable(distance_all, K):
    """Return the majority label among the K nearest rows of *distance_all*.

    Args:
        distance_all: (n+1, 2) string array from argue(); row 0 is a dummy
            seed row and is dropped. Column 0 holds distances, column 1 labels.
        K: number of nearest neighbours to vote with.

    Returns:
        The most frequent label (string) among the K smallest distances.
    """
    rows = distance_all[1:]  # drop the dummy seed row added by argue()
    # BUG FIX: the distance column has dtype str, so a plain argsort ordered the
    # values lexicographically ("10.0" sorted before "2.0"); convert to float first.
    order = np.argsort(rows[:, 0].astype(float))
    nearest_labels = rows[order[:K], 1]
    # Count label occurrences and return the most frequent one.
    counts = Counter(nearest_labels.tolist())
    return max(counts, key=counts.get)
def train(need_judge,K):
    """Classify the feature vector *need_judge* with KNN.

    Loads the training set, scores every sample's distance to the query,
    and returns the majority label of the K nearest neighbours.
    """
    training_data = init(train_address)
    scored = argue(need_judge, training_data)
    return find_suitable(scored, K)
def get_point(data):
    """Return the first four feature values of a sample row as a 4-tuple."""
    first, second, third, fourth = data[:4]
    return first, second, third, fourth
def ua(K):
    """Evaluate the classifier on the validation set and print its accuracy (percent).

    Args:
        K: number of nearest neighbours used for each prediction.
    """
    data = init(prove_address)  # load the validation set
    data_number = data.shape[0]  # total number of validation samples
    right = 0
    # Walk every validation sample; enumerate replaces the hand-rolled counter.
    for count, sample in enumerate(data, start=1):
        print("正在进行第", count, "次验证")
        features = np.array(get_point(sample), dtype=float)
        predicted = train(features, K)
        if predicted == sample[4]:  # does the prediction match the true label?
            right += 1
    # Accuracy = correct predictions / total samples. Python 3's / already yields a
    # float, so the original C-style (float)(...) cast was unnecessary; the unused
    # `faid` failure counter is dropped.
    accuracy = right / data_number
    print("当前模型的成功率是百分之", accuracy * 100)
if __name__ == '__main__':
    # Simple interactive menu: 1 = accuracy check, 2 = single prediction, 3 = quit.
    while True:
        choose = input("1.查看准确率"
                       "2.开始预测"
                       "3.退出\n")
        print(choose)
        if choose == '1':
            # BUG FIX: K was prompted for before the menu choice was examined,
            # so even exiting (option 3) demanded a K value. Ask only when needed.
            K = int(input("输入K: "))
            ua(K)
        elif choose == '2':
            K = int(input("输入K: "))
            x = input("输入特征值1: ")
            y = input("输入特征值2: ")
            z = input("输入特征值3: ")
            r = input("输入特征值4: ")
            need_judge = np.array([x, y, z, r], dtype=float)
            kinds = train(need_judge, K)
            print("根据您输入的预测值,我雯耶同学猜的是", kinds)
        elif choose == '3':
            break
        else:
            print("输入错误")
    print("欢迎下次使用")
6.KNN的缺点和优点
优点:算法简单,方便实现,准确率较高
缺点:使用有局限性,对于图片等数据来说类别并不是简单的距离关系;预测时需要遍历整个数据集,内存消耗大,耗时久。