1. What is KNN
The k-nearest neighbours algorithm (K-Nearest Neighbour, K-NN) is a basic classification and regression method; it is theoretically mature and one of the simplest machine learning algorithms. The idea: if the majority of the K samples most similar to a given sample (i.e., its nearest neighbours in feature space) belong to a certain class, then the sample belongs to that class as well.
K-NN is simple and intuitive. Given a training set and a new input instance, find the K instances in the training set nearest to that instance; whichever class holds the majority among these K instances is the class assigned to the input instance. K-NN has no explicit learning process.
2. Algorithm Steps
step.1---initialize the distances to the maximum value
step.2---compute the distance dist between the unknown sample and each training sample
step.3---find maxdist, the largest distance among the current K nearest samples
step.4---if dist is smaller than maxdist, take this training sample as one of the K nearest neighbours
step.5---repeat steps 2, 3 and 4 until the distances between the unknown sample and all training samples have been computed
step.6---count how many times each class appears among the K nearest samples
step.7---take the most frequent class as the class of the unknown sample (a minimal sketch of these steps follows this list)
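The examples below take a simpler route (compute all distances, then sort), but the steps above can be implemented directly by maintaining the current K nearest neighbours during a single scan. A minimal sketch, where knn_scan is a hypothetical helper name not used in the examples:

import numpy as np

def knn_scan(dataSet, dataSet_label, testdata, k):
    # step 1: initialize the k nearest distances to the maximum value
    neigh_dist = np.full(k, np.inf)
    neigh_label = np.empty(k, dtype=object)
    # step 5: the loop repeats steps 2-4 over every training sample
    for i in range(np.shape(dataSet)[0]):
        # step 2: distance between the unknown sample and this training sample
        dist = np.sqrt(np.sum((dataSet[i, :] - testdata[0, :]) ** 2))
        # step 3: largest distance (maxdist) among the current k nearest samples
        j = np.argmax(neigh_dist)
        # step 4: if dist < maxdist, this sample replaces the farthest neighbour
        if dist < neigh_dist[j]:
            neigh_dist[j] = dist
            neigh_label[j] = dataSet_label[0, i]
    # steps 6-7: count each class among the k nearest and return the most frequent
    labels, counts = np.unique(neigh_label.astype(str), return_counts=True)
    return labels[np.argmax(counts)]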
Example 1 - The Iris Dataset
import numpy as np
import operator
'''
Training set: samples (m) × features (n)
    x1 x2 x3 x4 ... xn
1
2
...
m
Labels of the m samples: a (1 × m) two-dimensional matrix
    1 2 3 ... m
Features of the test sample: a (1 × n) two-dimensional matrix
    x1 x2 x3 x4 ... xn
Although these are one-dimensional, writing them as 2-D matrices keeps the format uniform
'''
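For reference, the metrics implemented below are, in LaTeX notation:

d_{\mathrm{Manhattan}}(x, y) = \sum_{j=1}^{n} |x_j - y_j|
d_{\mathrm{Euclidean}}(x, y) = \sqrt{\sum_{j=1}^{n} (x_j - y_j)^2}
\mathrm{sim}_{\mathrm{Cosine}}(x, y) = \frac{\sum_j x_j y_j}{\sqrt{\sum_j x_j^2}\,\sqrt{\sum_j y_j^2}}
\mathrm{sim}_{\mathrm{Pearson}}(x, y) = \frac{\sum_j (x_j - \bar{x})(y_j - \bar{y})}{\sqrt{\sum_j (x_j - \bar{x})^2 \sum_j (y_j - \bar{y})^2}}

The code negates the cosine and Pearson similarities so that, for every metric, a smaller value means a closer match.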
def Manhattan(dataSet, testdata):
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        # Manhattan distance is the sum of absolute differences; no square root
        distance[i] = np.sum(np.abs(dataSet[i, :] - testdata[0, :]))
    return distance
def Euclidean(dataSet, testdata):
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        distance[i] = np.sqrt(np.sum(np.power(dataSet[i, :] - testdata[0, :], 2)))
    return distance
def Cosine(dataSet, testdata):
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        # the trailing \ is a line continuation so the expression can span two lines
        # negate the similarity: a smaller distance value then means higher similarity
        distance[i] = -(np.sum(dataSet[i, :] * testdata[0, :])) / (np.sqrt(np.sum( \
            np.power(dataSet[i, :], 2))) * np.sqrt(np.sum(np.power(testdata[0, :], 2))))
    return distance
def Pearson(dataSet, testdata):
    m = np.shape(dataSet)[0]
    n = np.shape(dataSet)[1]
    distance = np.zeros(m)
    ave_t = np.sum(testdata[0, :]) / n
    for i in range(m):
        ave_d = np.sum(dataSet[i, :]) / n
        # negate the correlation so a smaller value means higher similarity; the absolute
        # value treats a strong negative correlation as similarity as well
        distance[i] = -np.abs(np.sum((testdata[0, :] - ave_t) * (dataSet[i, :] - ave_d)) / np.sqrt \
            (np.sum(np.power(testdata[0, :] - ave_t, 2)) * np.sum(np.power(dataSet[i, :] - ave_d, 2))))
    return distance
def Jaccard(dataSet, testdata):
    # placeholder in the original post; left unimplemented
    return
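For completeness, a minimal sketch of how this placeholder could be filled in, assuming binary (0/1) feature vectors, which is where Jaccard distance makes sense (it does not apply to the continuous Iris features):

def Jaccard(dataSet, testdata):
    # Jaccard distance = 1 - |intersection| / |union| of the two binary vectors
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        inter = np.sum(np.logical_and(dataSet[i, :], testdata[0, :]))
        union = np.sum(np.logical_or(dataSet[i, :], testdata[0, :]))
        distance[i] = 1.0 - inter / union
    return distance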
#training set, training labels, test sample, distance function, value of k
def Knn(dataSet, dataSet_label, testdata, distance_way, k):
    #dist_sort holds the indices that sort the distances in ascending order
    dist_sort = distance_way(dataSet, testdata).argsort()
    #dictionary of {class: count}
    classcount = {}
    #walk through the k nearest samples
    for i in range(k):
        label = dataSet_label[0, dist_sort[i]]
        classcount[label] = classcount.get(label, 0) + 1
    #classcount_sort is the classcount items sorted by count in descending order
    classcount_sort = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    #return the label with the highest count
    return classcount_sort[0][0]
def get_data(txt_name):
    #count the number of lines in the txt file
    count = len(open(txt_name).readlines())
    #create the data and label matrices
    data = np.zeros(shape=(count, 4))
    label = np.zeros(shape=(count, 1))
    label = label.astype(str)
    i = 0
    file = open(txt_name)
    for line in file.readlines():
        #strip() removes leading and trailing whitespace
        #split() splits the line on the separator and returns a list of strings
        cutline = line.strip().split(',')
        data[i, :] = np.array(cutline[0:-1], dtype=float)
        label[i, 0] = cutline[-1]
        i += 1
    return data, label.T
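get_data expects each line of Iris.txt to follow the standard UCI format: four comma-separated feature values followed by the class label, for example:

5.1,3.5,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor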
'''
---------------------------------------------------begin----------------------------------------------
'''
a = get_data('Iris.txt')
dataSet, dataSet_label = a[0], a[1]
testdata = np.array([
    [5.4, 3.7, 1.5, 0.2]
], dtype=float)
print(Knn(dataSet, dataSet_label, testdata, Manhattan, 10))
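This prints the predicted class of the test point. As a possible extension (not in the original post), the classifier can be scored with leave-one-out validation, reusing the Knn function and the data loaded above:

correct = 0
m = dataSet.shape[0]
for i in range(m):
    # hold out sample i and predict it from the remaining samples
    train = np.delete(dataSet, i, axis=0)
    train_label = np.delete(dataSet_label, i, axis=1)
    test = dataSet[i, :].reshape(1, -1)
    if Knn(train, train_label, test, Euclidean, 10) == dataSet_label[0, i]:
        correct += 1
print('leave-one-out accuracy: %.3f' % (correct / m))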
Example 2 - Handwritten Digit Recognition
Dataset source:
https://www.jianshu.com/p/3d0bba113fd6
Only the first two files there (the training images and training labels) are used.
Implementation:
import struct
import numpy as np
import matplotlib.pyplot as plt
import operator
# Manhattan, Euclidean, Cosine, Pearson and Knn are identical to the
# definitions in Example 1 and are not repeated here; reuse them from above.
# read the mnist files
def load_mnist(label_path, image_path):
    with open(label_path, 'rb') as lbpath:
        # label file header: two big-endian 32-bit integers (magic number, item count)
        magic, n = struct.unpack('>II', lbpath.read(8))
        label = np.fromfile(lbpath, dtype=np.uint8)
    with open(image_path, 'rb') as imgpath:
        # image file header: four big-endian 32-bit integers (magic, count, rows, cols)
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        image = np.fromfile(imgpath, dtype=np.uint8).reshape(len(label), 784)
    # 60000 training samples and their labels
    # label: 1-D array of length 60000
    # image: 60000 × 784 (784 = 28 × 28)
    return label, image
'''
--------------------------------begin--------------------------------------------
'''
a = load_mnist('train-labels.idx1-ubyte', 'train-images.idx3-ubyte')
# reshape the 1-D label array to 2-D to keep the format uniform
dataSet_label, dataSet = a[0].reshape(1, 60000), a[1]
# test sample: the training sample at index 8999 (note that because it is drawn
# from the training set, its nearest neighbour is itself, at distance 0)
testdata = dataSet[8999, :].reshape(1, 784)
# k is set to 10
print(Knn(dataSet, dataSet_label, testdata, Euclidean, 10))
# display the test image
plt.imshow(testdata.reshape(28, 28), cmap='Greys')
plt.show()
Result: the predicted digit is printed and the 28 × 28 test image is displayed.
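A possible extension (not part of the original post): score the classifier on the official MNIST test set, assuming the t10k files from the same source sit next to the training files. Brute-force KNN compares each query against all 60000 training images, so only a small slice is evaluated:

test_label, test_image = load_mnist('t10k-labels.idx1-ubyte', 't10k-images.idx3-ubyte')
correct = 0
n = 100  # each prediction scans all 60000 training images, so keep the slice small
for i in range(n):
    pred = Knn(dataSet, dataSet_label, test_image[i, :].reshape(1, 784), Euclidean, 10)
    correct += (pred == test_label[i])
print('accuracy on the first %d test images: %.2f' % (n, correct / n))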