KNN(K近邻)是最简单的分类算法之一:它没有显式的训练过程,直接根据样本间的距离,取距离目标最近的K个已知样本的标签进行投票分类。
下面给出一种简单的实现:
'''
手动实现KNN算法
'''
import numpy as np
class KNN:
    '''Brute-force k-nearest-neighbors classifier over an in-memory sample set.'''

    def __init__(self, exp_x, exp_y, weights=1, distance_func=None):
        '''
        :param exp_x: sequence of sample feature vectors
        :param exp_y: sequence of labels aligned with exp_x
        :param weights: per-feature weight(s) applied inside the distance;
                        a scalar weights every feature equally
        :param distance_func: optional distance function with signature
                              distance_func(sample_vector, target_vector, weights);
                              defaults to the weighted Euclidean distance
        '''
        self.__weights = weights
        # Bug fix: the original assigned self.__x only when exp_x was NOT
        # already an ndarray, so passing an ndarray raised AttributeError in
        # predict(). np.asarray() handles both cases (no copy for an ndarray).
        self.__x = np.asarray(exp_x)
        self.__y = exp_y
        # Fall back to the built-in weighted Euclidean distance.
        self.__distance_func = self.default_dist if distance_func is None else distance_func

    def default_dist(self, vector, target_vector, weights):
        '''Return the weighted Euclidean distance between the two vectors.'''
        return np.sum(((target_vector - vector) * weights) ** 2) ** 0.5

    def predict(self, target_vector, k=5):
        '''
        Classify target_vector by majority vote among its k nearest samples.

        :param target_vector: feature vector to classify
        :param k: number of nearest neighbors to vote (1 <= k <= sample count)
        :return: the most frequent label among the k nearest neighbors
                 (ties broken by nearest-first insertion order)
        :raises ValueError: if k is out of range
        '''
        # Bug fix: the original used `assert`, which is stripped under
        # `python -O`; validate explicitly instead.
        if not 1 <= k <= len(self.__y):
            raise ValueError('k must be between 1 and the number of samples')
        target_vector = np.asarray(target_vector)
        # Distance from every stored sample to the target. For a very large
        # sample set this list would need external (database) storage instead.
        all_dist = [(self.__distance_func(vector, target_vector, self.__weights), tag)
                    for vector, tag in zip(self.__x, self.__y)]
        all_dist.sort(key=lambda item: item[0])
        # Bug fix: the original sliced [:k+1] and therefore voted with k+1
        # neighbors instead of k.
        nearest = all_dist[:k]
        # Tally label frequencies among the k nearest neighbors.
        freq = {}
        for _, tag in nearest:
            freq[tag] = freq.get(tag, 0) + 1
        # Return the most frequent label.
        return max(freq.items(), key=lambda item: item[1])[0]
if __name__ == '__main__':
    # Tiny smoke test: three 'A' samples clustered around the origin and
    # three 'B' samples clustered along the x-axis at x = 10..12.
    features = [[0, 0],
                [1, 1],
                [-1, -1],
                [10, 0],
                [11, 0],
                [12, 0]]
    labels = ['A', 'A', 'A', 'B', 'B', 'B']
    classifier = KNN(features, labels)
    prediction = classifier.predict([7, 6], 2)
    print(prediction)
测试结果:
"D:\Program Files\Python\Python36\python.exe" D:/Workspace/测试空间/机器学习/实现knn算法.py
B
Process finished with exit code 0