python实现K-近邻算法
摘要:本文首先浅谈了自己对K-近邻算法的理解,进而通过Python一步步实现K-近邻算法,并通过Matplotlib对数据可视化,最后,选取相应的测试数据集对算法进行测试。
关键词:机器学习, k-近邻算法 , python, matplotlib
1、简介
k-近邻算法(K Nearest Neighbor,KNN),简单来说就是采用测量不同特征值之间的距离方法进行分类。即给定一个训练数据集,并且样本数据集中的每个数据都存在标签,对新的输入实例,在训练数据集中找到与该实例最邻近的K个实例(也就是上面所说的K个邻居), 这K个实例的多数属于某个类,就把该输入实例分类到这个类中。
在进行最近点计算时,采用欧氏距离公式,计算两个向量点xA和xB之间的距离:
$$d(x_A, x_B) = \sqrt{\sum_{i=1}^{n}\left(x_{A,i} - x_{B,i}\right)^2}$$
2、数据准备
数据以txt文件格式存储,包括三类特征:
1. 每年运动的总里程数
2. 每周打游戏所占时间百分比
3. 每天喝饮料的升数
从文本读取数据代码:
def file2matrix(filename):
    """Load a whitespace-separated data file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns;
    the last column of the line is parsed as the integer class label.

    :param filename: path of the text file to read
    :return: (returnMat, classLabelVector) where returnMat is an
             [n x 3] float array holding the first three columns and
             classLabelVector is a list with the n integer labels
    :raises ValueError: if a field cannot be parsed as a number
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no sample and are skipped instead of crashing the parser.
        rows = [line.split() for line in fr if line.strip()]

    returnMat = np.zeros([len(rows), 3])
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
使用matplotlib工具包可视化上述数据:
def test2():
    """Plot the normalised dating data: one 3-D view and two 2-D views.

    Reads './datingTestSet2.txt' with file2matrix, rescales it with autoNorm,
    prints the normalised matrix and shows three figures in which the samples
    labelled 1, 2 and 3 are drawn in red, green and blue with marker sizes
    10, 20 and 30.

    Column meaning used for the axis labels (the same mapping the two 2-D
    figures already use): 0 = yearly mileage, 1 = share of time spent
    gaming, 2 = litres drunk per day.

    :return: None
    """
    datingDataMat, datingLabels = file2matrix('./datingTestSet2.txt')
    datingDataMat, _, _ = autoNorm(datingDataMat)
    print(datingDataMat)
    datingLabels = np.array(datingLabels)  # list -> array for boolean masks
    legendNames = ['不受欢迎', '一般受欢迎', '非常受欢迎']

    def scatterByClass(ax, columns):
        """Draw one scatter series per class on ax; return the handles."""
        handles = []
        for classId, colour, size in ((1, 'r', 10), (2, 'g', 20), (3, 'b', 30)):
            rows = datingDataMat[datingLabels == classId]
            coords = [rows[:, col] for col in columns]
            handles.append(ax.scatter(*coords, marker='o', c=colour, s=size))
        return handles

    # Figure 1: all three features in 3-D.
    fig = plt.figure()
    # Axes3D(fig) no longer attaches itself to the figure on current
    # Matplotlib releases, which leaves the window empty; requesting the
    # '3d' projection from the figure works on old and new versions.
    ax = fig.add_subplot(111, projection='3d')
    handles = scatterByClass(ax, (0, 1, 2))
    # Labels follow the plotted columns: x = column 0, y = column 1, z = column 2.
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'每周打游戏所占时间百分比')
    ax.set_zlabel(r'每天喝饮料的升数')
    ax.set_xlim(-0.1, 1.1)
    ax.set_ylim(-0.1, 1.1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 2: gaming share (column 1) against litres drunk (column 2).
    fig = plt.figure(2)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (1, 2))
    ax.set_xlabel(r'玩游戏所耗的时间百分比')
    ax.set_ylabel(r'每天喝饮料的升数')
    ax.set_ylim(0, 1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 3: yearly mileage (column 0) against gaming share (column 1).
    fig = plt.figure(3)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (0, 1))
    ax.legend(handles=handles, labels=legendNames, loc='best')
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'玩游戏所耗的时间百分比')
    ax.set_xlim(0, 1)

    plt.show()
归一化处理
由于采用欧氏距离作为计算最近点的方式,数值范围较大的属性(例如每年运动的总里程数)会在距离计算中占主导地位,从而掩盖其他属性的影响。
采用数值归一化方法可以避免这种问题,即 newValue = (oldValue - min) / (max - min),将每个特征缩放到[0, 1]区间。
def autoNorm(dataSet):
    """Min-max normalise every column of dataSet.

    Each value is mapped to (value - column_min) / (column_max - column_min).
    A column whose values are all equal has a zero range; it is mapped to
    0 instead of producing NaN through a 0/0 division.

    :param dataSet: array-like of shape [m x n], one sample per row
    :return: (normDataSet, ranges, minVals) where normDataSet is the
             [m x n] normalised array, ranges is the per-column
             max - min and minVals the per-column minimum (both length n)

    Note: ranges is returned unmodified, so it still contains 0 for a
    constant column; callers that rescale new samples with it must guard
    against that themselves.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
分类算法:
参数:
inX–输入(待分类数据)
dataSet–已获取的数据集
labels–对应的分类标签
k–近邻值(一般小于20)
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training samples.

    Distances are Euclidean. When several classes receive the same number
    of votes, the class that appears first among the nearest neighbours wins.

    :param inX: feature vector to classify (length n, or shape [1 x n])
    :param dataSet: training samples, array-like of shape [m x n]
    :param labels: sequence with the m class labels of dataSet
    :param k: number of neighbours that vote; values larger than m are
              reduced to m
    :return: the winning label (an element of labels)
    :raises ValueError: if dataSet is empty or k is smaller than 1
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Asking for more neighbours than there are samples used to fail with
    # an IndexError; voting with every sample is the closest valid result.
    k = min(k, dataSetSize)

    # Broadcasting subtracts inX from every row of dataSet.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}  # label -> number of votes, in order of first appearance
    for neighbour in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbour]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first label holding the highest count, which is the
    # same winner a stable descending sort by count would put first.
    return max(classCount, key=classCount.get)
测试算法:
100组作为测试输入
900组作为数据集
def datingClassTest(hoRatio=0.1, filename='datingTestSet2.txt', k=4):
    """Measure the hold-out error rate of classify0 on the dating data.

    The first int(m * hoRatio) normalised rows are used as test samples and
    the remaining rows as training data. Every prediction and the final
    error rate are printed.

    :param hoRatio: fraction of the rows held out for testing
    :param filename: data file in the format file2matrix reads
    :param k: number of neighbours passed to classify0
    :return: None
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    print(numTestVecs)
    if numTestVecs == 0:
        # Without test samples the rate below would divide by zero.
        print("no test samples available, error rate not computed")
        return

    # The training part is identical for every test sample: slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(inX=normMat[i, :],
                                     dataSet=trainMat,
                                     labels=trainLabels,
                                     k=k)
        print("the classifier came back with :%d,the real answer is %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is %f" % (errorCount / float(numTestVecs)))
验证结果:
小结
- 一种分类算法(属于有监督学习,并非聚类):找出与待分类样本距离最近的K个训练样本,取其中占多数的类别作为最终分类
- 数据归一化处理,避免某个属性的差异导致较大的误差
- matplotlib工具包的应用
参考文献
[1].PeterHarrington, 哈林顿, 李锐. 机器学习实战[M]. 人民邮电出版社, 2013.
完整代码:
import numpy as np
import operator
import os
import matplotlib.pyplot as plt
from pylab import mpl
from mpl_toolkits.mplot3d import Axes3D
# Select the SimHei font for sans-serif text; presumably chosen so that the
# Chinese axis labels and legend entries used below render correctly.
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Works around the minus sign '-' being drawn as a square box in saved figures.
mpl.rcParams['axes.unicode_minus'] = False
def createDataSet():
    """Return a tiny hand-made 2-D training set for trying out classify0.

    :return: (group, labels) where group is a [4 x 2] float array and
             labels the list of the four matching class names
    """
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = np.array([point for point, _ in samples])
    labels = [name for _, name in samples]
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training samples.

    Distances are Euclidean. When several classes receive the same number
    of votes, the class that appears first among the nearest neighbours wins.

    :param inX: feature vector to classify (length n, or shape [1 x n])
    :param dataSet: training samples, array-like of shape [m x n]
    :param labels: sequence with the m class labels of dataSet
    :param k: number of neighbours that vote; values larger than m are
              reduced to m
    :return: the winning label (an element of labels)
    :raises ValueError: if dataSet is empty or k is smaller than 1
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Asking for more neighbours than there are samples used to fail with
    # an IndexError; voting with every sample is the closest valid result.
    k = min(k, dataSetSize)

    # Broadcasting subtracts inX from every row of dataSet.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}  # label -> number of votes, in order of first appearance
    for neighbour in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbour]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first label holding the highest count, which is the
    # same winner a stable descending sort by count would put first.
    return max(classCount, key=classCount.get)
def file2matrix(filename):
    """Load a whitespace-separated data file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns;
    the last column of the line is parsed as the integer class label.

    :param filename: path of the text file to read
    :return: (returnMat, classLabelVector) where returnMat is an
             [n x 3] float array holding the first three columns and
             classLabelVector is a list with the n integer labels
    :raises ValueError: if a field cannot be parsed as a number
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no sample and are skipped instead of crashing the parser.
        rows = [line.split() for line in fr if line.strip()]

    returnMat = np.zeros([len(rows), 3])
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Min-max normalise every column of dataSet.

    Each value is mapped to (value - column_min) / (column_max - column_min).
    A column whose values are all equal has a zero range; it is mapped to
    0 instead of producing NaN through a 0/0 division.

    :param dataSet: array-like of shape [m x n], one sample per row
    :return: (normDataSet, ranges, minVals) where normDataSet is the
             [m x n] normalised array, ranges is the per-column
             max - min and minVals the per-column minimum (both length n)

    Note: ranges is returned unmodified, so it still contains 0 for a
    constant column; callers that rescale new samples with it must guard
    against that themselves.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest(hoRatio=0.1, filename='datingTestSet2.txt', k=4):
    """Measure the hold-out error rate of classify0 on the dating data.

    The first int(m * hoRatio) normalised rows are used as test samples and
    the remaining rows as training data. Every prediction and the final
    error rate are printed.

    :param hoRatio: fraction of the rows held out for testing
    :param filename: data file in the format file2matrix reads
    :param k: number of neighbours passed to classify0
    :return: None
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    print(numTestVecs)
    if numTestVecs == 0:
        # Without test samples the rate below would divide by zero.
        print("no test samples available, error rate not computed")
        return

    # The training part is identical for every test sample: slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(inX=normMat[i, :],
                                     dataSet=trainMat,
                                     labels=trainLabels,
                                     k=k)
        print("the classifier came back with :%d,the real answer is %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is %f" % (errorCount / float(numTestVecs)))
def classifyPerson():
    """Ask for one person's three feature values and print the predicted class.

    The answers are read from standard input, scaled with the minimum and
    range that autoNorm derives from 'datingTestSet2.txt', and classified
    with classify0 using k=3 against the whole normalised data set.

    :return: None
    """
    resultList = ['不受欢迎', '一般受欢迎', '非常受欢迎']

    # The prompts are asked in this fixed order.
    percentTats = float(input('每周打游戏所占时间百分比(0-100):'))
    ffMiles = float(input("每年运动的总里程数(0-100000):"))
    iceCream = float(input('每天喝饮料的升数(0-10):'))

    features, classes = file2matrix('datingTestSet2.txt')
    scaled, spans, lows = autoNorm(features)

    # The query vector uses the column order mileage, gaming share, litres.
    query = np.array([ffMiles, percentTats, iceCream])
    scaledQuery = (query - lows) / spans
    result = classify0(inX=scaledQuery, dataSet=scaled, labels=classes, k=3)

    # Class labels start at 1, list positions at 0.
    print("你是一个这样的人:", resultList[result - 1])
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with int() and stored row after row.

    :param filename: path of the text image
    :return: float array of shape [1 x 1024]
    :raises IndexError: if a line has fewer than 32 characters
    :raises ValueError: if a character is not a digit
    """
    returnVector = np.zeros([1, 1024])  # row vector
    # The context manager closes the file, also when parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVector[0, 32 * i + j] = int(lineStr[j])
    return returnVector
def handwritingClassTest(trainingDir=os.path.join('digits', 'trainingDigits'),
                         testDir=os.path.join('digits', 'testDigits'),
                         k=5):
    """Evaluate classify0 on the handwritten-digit text images.

    Every file in trainingDir becomes one training row. Every file in
    testDir is then classified, and each prediction, the number of errors,
    the number of test files and the error rate are printed. The true
    digit is taken from the part of the file name before the first '_'
    (for example '0_0.txt' -> 0).

    :param trainingDir: directory holding the training images
    :param testDir: directory holding the test images
    :param k: number of neighbours passed to classify0
    :return: None
    """
    hwLabels = []
    trainingFileList = os.listdir(trainingDir)
    m = len(trainingFileList)  # number of training samples
    trainingMat = np.zeros([m, 1024])
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]  # e.g. '0_0'
        hwLabels.append(int(fileStr.split('_')[0]))  # class label
        # os.path.join builds the path with the separator of the running
        # platform; hard-coded backslashes only work on Windows.
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = os.listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print('分类结果为:%d,真实结果为:%d' % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1
    print("\n错误的总个数:%d" % errorCount)
    print('\n总个数:%d' % mTest)
    if mTest:
        # An empty test directory has no error rate (would divide by zero).
        print('\n误差率:%f' % (errorCount / float(mTest)))
def test1():
    """Smoke-test classify0 on the toy data set and show its scatter plot.

    Three query points are classified with k=3, the list of predicted
    labels is printed and the training points are displayed.

    :return: None
    """
    group, labels = createDataSet()
    plt.figure()
    plt.scatter(group[:, 0], group[:, 1])
    queries = [[0, 1], [0, 0], [2, 2]]
    ans = [classify0(point, group, labels, 3) for point in queries]
    print(ans)
    plt.show()
def test2():
    """Plot the normalised dating data: one 3-D view and two 2-D views.

    Reads './datingTestSet2.txt' with file2matrix, rescales it with autoNorm,
    prints the normalised matrix and shows three figures in which the samples
    labelled 1, 2 and 3 are drawn in red, green and blue with marker sizes
    10, 20 and 30.

    Column meaning used for the axis labels (the same mapping the two 2-D
    figures already use): 0 = yearly mileage, 1 = share of time spent
    gaming, 2 = litres drunk per day.

    :return: None
    """
    datingDataMat, datingLabels = file2matrix('./datingTestSet2.txt')
    datingDataMat, _, _ = autoNorm(datingDataMat)
    print(datingDataMat)
    datingLabels = np.array(datingLabels)  # list -> array for boolean masks
    legendNames = ['不受欢迎', '一般受欢迎', '非常受欢迎']

    def scatterByClass(ax, columns):
        """Draw one scatter series per class on ax; return the handles."""
        handles = []
        for classId, colour, size in ((1, 'r', 10), (2, 'g', 20), (3, 'b', 30)):
            rows = datingDataMat[datingLabels == classId]
            coords = [rows[:, col] for col in columns]
            handles.append(ax.scatter(*coords, marker='o', c=colour, s=size))
        return handles

    # Figure 1: all three features in 3-D.
    fig = plt.figure()
    # Axes3D(fig) no longer attaches itself to the figure on current
    # Matplotlib releases, which leaves the window empty; requesting the
    # '3d' projection from the figure works on old and new versions.
    ax = fig.add_subplot(111, projection='3d')
    handles = scatterByClass(ax, (0, 1, 2))
    # Labels follow the plotted columns: x = column 0, y = column 1, z = column 2.
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'每周打游戏所占时间百分比')
    ax.set_zlabel(r'每天喝饮料的升数')
    ax.set_xlim(-0.1, 1.1)
    ax.set_ylim(-0.1, 1.1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 2: gaming share (column 1) against litres drunk (column 2).
    fig = plt.figure(2)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (1, 2))
    ax.set_xlabel(r'玩游戏所耗的时间百分比')
    ax.set_ylabel(r'每天喝饮料的升数')
    ax.set_ylim(0, 1)
    ax.legend(handles=handles, labels=legendNames)

    # Figure 3: yearly mileage (column 0) against gaming share (column 1).
    fig = plt.figure(3)
    ax = fig.add_subplot(111)
    handles = scatterByClass(ax, (0, 1))
    ax.legend(handles=handles, labels=legendNames, loc='best')
    ax.set_xlabel(r'每年运动的总里程数')
    ax.set_ylabel(r'玩游戏所耗的时间百分比')
    ax.set_xlim(0, 1)

    plt.show()
def test3():
    """Print the normalised dating matrix with its column ranges and minima.

    :return: None
    """
    features, _ = file2matrix('./datingTestSet2.txt')
    normalised = autoNorm(dataSet=features)
    # autoNorm returns (normMat, ranges, minVals); print them in that order.
    print(*normalised)
def test4():
    """Convert one digit image to a row vector and print its first 31 values.

    :return: None
    """
    rowVector = img2vector('digits/testDigits/0_0.txt')
    flat = rowVector[0]
    print(flat[0:31])
def test5():
    """Run the handwritten-digit evaluation (handwritingClassTest)."""
    handwritingClassTest()
if __name__ == "__main__":
    # Enable the entry point to run; the others stay commented out.
    # test1()            # KNN smoke test on the toy data set
    # test2()            # plotting test
    # test3()            # normalisation test
    # datingClassTest()  # measure the classifier's error rate
    # classifyPerson()
    # test4()
    test5()