KNN: 分类
也可用于回归
参考资料:KNN
"""
@File : KNN.py
@Time : 2020-7-30
"""
import pandas as pd
import numpy as np
import time
from collections import Counter
def loadData(fileName):
    """Load a headerless CSV dataset and split it into features and labels.

    The first column of every row is taken as the label; all remaining
    columns are the features, which are divided by 255 (normalisation).

    Args:
        fileName: Path of the CSV file; it is read with ``header=None``.

    Returns:
        A tuple ``(features, labels)`` where ``features`` is a
        ``numpy.matrix`` holding the scaled feature columns and ``labels``
        is a 1-D array holding the first column.
    """
    data = pd.read_csv(fileName, header=None).values  # DataFrame -> ndarray
    labels = data[:, 0]  # first column
    # np.mat was removed from the NumPy 2.0 namespace; np.asmatrix builds the
    # same matrix type on both old and new NumPy versions.
    features = np.asmatrix(data[:, 1:])
    return features / 255, labels
def caculDistance(x1, x2):
    """Return the Euclidean distance between two points.

    Computes ``sqrt(sum((x1 - x2) ** 2))`` over every element, so row
    vectors stored as 1 x d matrices work as well as 1-D arrays.

    Args:
        x1: First point (array-like of numbers).
        x2: Second point, broadcast-compatible with ``x1``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # Cast to float first: with unsigned integer inputs (e.g. raw uint8
    # data) both the subtraction and the squaring would silently wrap around.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(np.square(diff)))
def findCluster(x_train, y_train, x, k):
    """Classify ``x`` by majority vote among its ``k`` nearest training samples.

    Distances are Euclidean. When several labels share the highest vote
    count, the label that appears first in nearest-to-farthest order wins.

    Args:
        x_train: Training samples, one per row (2-D array or matrix).
        y_train: Labels, indexable by the row index of ``x_train``.
        x: The sample to classify (1-D array or a single-row matrix).
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples make every sample vote.

    Returns:
        The winning label, taken from ``y_train``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``x_train`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    train = np.asarray(x_train, dtype=float)
    if train.shape[0] == 0:
        raise ValueError("x_train must contain at least one sample")
    query = np.asarray(x, dtype=float).reshape(1, -1)

    # One vectorised pass instead of a Python-level loop over every row.
    # einsum sums the squared differences per row without allocating a
    # second full-size temporary.
    diff = train - query
    distances = np.sqrt(np.einsum('ij,ij->i', diff, diff))

    # Stable sort: equal distances keep training order, so results are
    # deterministic. Slicing (rather than indexing k times) tolerates
    # k > number of samples.
    nearest = np.argsort(distances, kind="stable")[:k]

    votes = Counter(y_train[idx] for idx in nearest)
    return votes.most_common(1)[0][0]  # keep the label, drop the count
def tt(x_train, y_train, x_test, y_test, k, num_samples=50):
    """Evaluate the KNN classifier on the leading test samples.

    For each evaluated sample the predicted and true labels are printed,
    followed by the running accuracy so far.

    Args:
        x_train: Training samples, one per row.
        y_train: Training labels.
        x_test: Test samples, one per row.
        y_test: True labels of the test samples.
        k: Number of neighbours passed to ``findCluster``.
        num_samples: How many leading test samples to evaluate. Defaults to
            50; ``None`` evaluates the whole test set. The value is clamped
            to the number of available test samples.

    Returns:
        The accuracy over the evaluated samples, or ``0.0`` when no sample
        was evaluated.
    """
    available = min(len(x_test), len(y_test))
    # Clamp so that a test set smaller than the requested count does not
    # raise IndexError.
    limit = available if num_samples is None else min(num_samples, available)

    acc_num = 0
    for i in range(limit):
        cluster = findCluster(x_train, y_train, x_test[i], k)
        if cluster == y_test[i]:
            acc_num += 1
        print(f'find {i}th data cluster: cluster_pred={cluster},cluster={y_test[i]}')
        print('now_acc=', acc_num / (i + 1))
    return acc_num / limit if limit > 0 else 0.0
if __name__ == "__main__":
    # Script entry point: load both CSV files, run the evaluation with
    # k=20 and report the total wall-clock time.
    t_begin = time.time()
    train_x, train_y = loadData('Mnist/mnist_train.csv')
    test_x, test_y = loadData('Mnist/mnist_test.csv')
    tt(train_x, train_y, test_x, test_y, k=20)
    t_finish = time.time()
    print('run time=', t_finish - t_begin)
—————————————————————————
新收获命令
# Counter对象:任意可哈希(hashable)元素构成的序列对象
Counter.most_common(k) #Top k
out:[('eyes', 8), ('the', 5), ('look', 4)] #(元素,出现次数) k=3
#argsort()返回的是数组值从小到大的索引值
#argsort()[:k]返回前k个(第0~k-1位)最小值对应的索引值
x=np.array([1,4,3,-1,6,9])
x.argsort()
Out[3]: array([3, 0, 2, 1, 4, 5], dtype=int64)