最近开始学习机器学习实战,第一个就是KNN,由于K-近邻算法比较简单,这里不再介绍理论知识,直接看代码实现:
KNN的简单实现
需要用到的一些语法:
tile()
sum(axis=1)
argsort,sort 和 sorted,operator.itemgetter函数
get(),items(),iteritems()方法
# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt
# Build the tiny demo training set and its matching class labels.
def createDataset():
    """Return (group, labels): four 2-D sample points and their classes."""
    # Note the double brackets: this is a 2-D array, one row per sample.
    points = [[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]
    group = array(points)
    labels = ['A', 'A', 'B', 'B']
    return (group, labels)
# Simple k-nearest-neighbour classifier.
def classify0(inX, dataSet, labels, k):
    """Classify the input vector inX against dataSet using k-NN.

    inX     : input feature vector (list or 1-D array).
    dataSet : 2-D array of training samples, one row per sample.
    labels  : class labels, one per training row.
    k       : number of nearest neighbours that vote.
    Returns the majority label among the k nearest neighbours.
    """
    # shape[0] is the number of rows (training samples); shape[1] would be columns.
    dataSetSize = dataSet.shape[0]
    # tile() repeats inX to dataSet's shape so we can subtract element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Square each per-feature difference.
    sqDiffMat = diffMat ** 2
    # axis=1 sums across each row -> squared Euclidean distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    # Square root gives the Euclidean distance to every training vector.
    distances = sqDistances ** 0.5
    # Indices sorted by ascending distance (nearest first).
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        # Label of the i-th nearest neighbour.
        voteIlabel = labels[sortedDistIndicies[i]]
        # Tally the vote; get() defaults a missing label to 0.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    print('classCount:', classCount)
    # Sort (label, votes) pairs by vote count, descending.
    # BUG FIX: dict.iteritems() exists only in Python 2; items() works in
    # both Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    print(sortedClassCount)
    # Return the label with the most votes.
    return sortedClassCount[0][0]
# NOTE: the file read is datingTestSet2.txt, not datingTestSet.txt.
# BUG FIX: use a raw string — in Python 3, '\U' inside a plain string
# literal starts a unicode escape and is a SyntaxError.
file_raw = r'C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Load the demo training data and show it.
    group, labels = createDataset()
    print('training data set:', group)
    print('labels of training data set:', labels)
    # Run the simple classifier on point (0, 0) with k = 3.
    tt = classify0([0, 0], group, labels, 3)
    print('Classification results:', tt)
运行结果:
('training data set:', array([[ 1. , 1.1],
[ 1. , 1. ],
[ 0. , 0. ],
[ 0. , 0.1]]))
('labels of training data set:', ['A', 'A', 'B', 'B'])
('classCount:', {'A': 1, 'B': 2})
[('B', 2), ('A', 1)]
('Classification results:', 'B')
至此一个最简单的KNN分类就实现了
KNN算法改进约会网站的配对效果
数据的处理
会用到的语法:
matplotlib
min(iterable, *[, key, default])
# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt
# Parse the dating-data text file into a feature matrix and a label vector.
def file2matrix(filename):
    """Read training data from a tab-separated file.

    Each line holds three numeric features followed by an integer label.
    Returns (returnMat, classLabelVector): an N x 3 float matrix and a
    list of N integer class labels.
    """
    # BUG FIX: the original called fr.close() AFTER the return statement,
    # so it never ran and the file handle leaked; `with` closes it reliably.
    with open(filename, 'r') as fr:
        arrayOfLines = fr.readlines()
    numberOfLines = len(arrayOfLines)  # number of samples
    # N x 3 matrix of zeros, filled in row by row below.
    returnMat = zeros((numberOfLines, 3))
    print('row:%s and column:%s' % (returnMat.shape[0], returnMat.shape[1]))
    classLabelVector = []  # one integer label per sample
    for index, line in enumerate(arrayOfLines):
        # strip() removes surrounding whitespace, including the trailing '\n'.
        line = line.strip()
        # Split on tabs: fields 0-2 are features, the last field is the label.
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return (returnMat, classLabelVector)
# Scale every feature column into the range [0, 1].
def autoNorm(dataSet):
    """Min-max normalise dataSet column-wise.

    Returns (normDataSet, ranges, minVals):
    normDataSet - dataSet rescaled so each column spans [0, 1];
    ranges      - per-column (max - min), kept to normalise new samples;
    minVals     - per-column minimum, likewise kept for later use.
    """
    minVals = dataSet.min(0)  # axis 0 -> minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]  # number of rows
    # tile() repeats the per-column min/range to the full matrix shape so
    # (x - min) / range is applied element-wise.
    # (The original pre-allocated a zeros() array here that was immediately
    # overwritten — dead code, removed.)
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
# Hold out 10% of the dating data and measure the k-NN error rate.
def datingClassTest():
    """Evaluate classify0 on the dating data set; print the error rate."""
    hoRatio = 0.10  # fraction of samples held out for testing
    # BUG FIX: raw string — '\U' in a plain Python 3 string literal is a
    # SyntaxError (treated as the start of a unicode escape).
    datingDataMat, datingLabels = file2matrix(r'C:\Users\LiLong\Desktop\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of test vectors: the first 10% of rows.
    numTestVecs = int(m * hoRatio)
    print('the test number:', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test set; rows [numTestVecs, m)
        # and the matching labels form the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # BUG FIX: the last two lines were Python 2 print statements; converted
    # to print() calls.
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
# NOTE: the file read is datingTestSet2.txt, not datingTestSet.txt.
# BUG FIX: use a raw string — in Python 3, '\U' inside a plain string
# literal starts a unicode escape and is a SyntaxError.
file_raw = r'C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Parse the raw file into a feature matrix and a label vector.
    datingDataMat, datingLables = file2matrix(file_raw)
    # BUG FIX: Python 2 print statements converted to print() calls.
    print(datingDataMat)
    print(datingLables)
    # Scatter plot of feature columns 1 and 2.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
    # BUG FIX: the original line was garbled — the '\' continuation and the
    # keyword arguments ended up on one line ('...\ c=...'), which is a
    # SyntaxError. c gives per-point colour values, s per-point marker size,
    # both scaled by the class label so classes are visually separated.
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               c=15.0 * array(datingLables), s=15.0 * array(datingLables))
    plt.show()
    # Normalise all features into [0, 1].
    normMat, ranges, minVals = autoNorm(datingDataMat)
    print(normMat)
其中file2matrix得到的是数组矩阵,也即是可以处理的数据格式,如下:
[[ 4.09200000e+04 8.32697600e+00 9.53952000e-01]
[ 1.44880000e+04 7.15346900e+00 1.67390400e+00]
[ 2.60520000e+04 1.44187100e+00 8.05124000e-01]
...,
[ 2.65750000e+04 1.06501020e+01 8.66627000e-01]
[ 4.81110000e+04 9.13452800e+00 7.28045000e-01]
[ 4.37570000e+04 7.88260100e+00 1.33244600e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 
3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3]
下图是数据的散点图:
归一化后的数据:
[[ 0.44832535 0.39805139 0.56233353]
[ 0.15873259 0.34195467 0.98724416]
[ 0.28542943 0.06892523 0.47449629]
...,
[ 0.29115949 0.50910294 0.51079493]
[ 0.52711097 0.43665451 0.4290048 ]
[ 0.47940793 0.3768091 0.78571804]]
测试算法
# coding=utf-8
from numpy import *
import operator # 运算符模块,执行排序操作时将用到
import matplotlib.pyplot as plt
# Build the small demo training set together with its class labels.
def createDataset():
    """Return (group, labels): a 4 x 2 sample array and its classes."""
    # Two pairs of nearby points: class 'A' near (1, 1), class 'B' near (0, 0).
    group = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    return (group, ['A', 'A', 'B', 'B'])
# Simple k-nearest-neighbour classifier.
def classify0(inX, dataSet, labels, k):
    """Classify the input vector inX against dataSet using k-NN.

    inX     : input feature vector (list or 1-D array).
    dataSet : 2-D array of training samples, one row per sample.
    labels  : class labels, one per training row.
    k       : number of nearest neighbours that vote.
    Returns the majority label among the k nearest neighbours.
    """
    # shape[0] is the number of rows (training samples).
    dataSetSize = dataSet.shape[0]
    # tile() repeats inX to dataSet's shape so we can subtract element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Square each per-feature difference.
    sqDiffMat = diffMat ** 2
    # axis=1 sums across each row -> squared Euclidean distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    # Square root gives the Euclidean distance to every training vector.
    distances = sqDistances ** 0.5
    # Indices sorted by ascending distance (nearest first).
    sortedDistIndicies = distances.argsort()
    classCount = {}  # votes per label
    for i in range(k):
        # Label of the i-th nearest neighbour.
        voteIlabel = labels[sortedDistIndicies[i]]
        # Tally the vote; get() defaults a missing label to 0.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Sort (label, votes) pairs by vote count, descending.
    # BUG FIX: dict.iteritems() exists only in Python 2; items() works in
    # both Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    # Return the label with the most votes.
    return sortedClassCount[0][0]
# Parse the dating-data text file into a feature matrix and a label vector.
def file2matrix(filename):
    """Read training data from a tab-separated file.

    Each line holds three numeric features followed by an integer label.
    Returns (returnMat, classLabelVector): an N x 3 float matrix and a
    list of N integer class labels.
    """
    # BUG FIX: the original called fr.close() AFTER the return statement,
    # so it never ran and the file handle leaked; `with` closes it reliably.
    with open(filename, 'r') as fr:
        arrayOfLines = fr.readlines()
    numberOfLines = len(arrayOfLines)  # number of samples
    # N x 3 matrix of zeros, filled in row by row below.
    returnMat = zeros((numberOfLines, 3))
    print('row:%s and column:%s' % (returnMat.shape[0], returnMat.shape[1]))
    classLabelVector = []  # one integer label per sample
    for index, line in enumerate(arrayOfLines):
        # strip() removes surrounding whitespace, including the trailing '\n'.
        line = line.strip()
        # Split on tabs: fields 0-2 are features, the last field is the label.
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return (returnMat, classLabelVector)
# Scale every feature column into the range [0, 1].
def autoNorm(dataSet):
    """Min-max normalise dataSet column-wise.

    Returns (normDataSet, ranges, minVals):
    normDataSet - dataSet rescaled so each column spans [0, 1];
    ranges      - per-column (max - min), kept to normalise new samples;
    minVals     - per-column minimum, likewise kept for later use.
    """
    minVals = dataSet.min(0)  # axis 0 -> minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]  # number of rows
    # tile() repeats the per-column min/range to the full matrix shape so
    # (x - min) / range is applied element-wise.
    # (The original pre-allocated a zeros() array here that was immediately
    # overwritten — dead code, removed.)
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
# Hold out 10% of the dating data and measure the k-NN error rate.
def datingClassTest():
    """Evaluate classify0 on the dating data set; print the error rate."""
    hoRatio = 0.10  # hold out 10% of the samples for testing
    # BUG FIX: raw string — '\U' in a plain Python 3 string literal is a
    # SyntaxError (treated as the start of a unicode escape).
    datingDataMat, datingLabels = file2matrix(r'C:\Users\LiLong\Desktop\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of test vectors: the first 10% of rows.
    numTestVecs = int(m * hoRatio)
    print('the test number:', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test set; rows [numTestVecs, m)
        # and the matching labels form the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # BUG FIX: the last two lines were Python 2 print statements; converted
    # to print() calls.
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
# NOTE: the file read is datingTestSet2.txt, not datingTestSet.txt.
#file_raw='C:\Users\LiLong\Desktop\datingTestSet2.txt'
if __name__ == "__main__":
    # Run the hold-out evaluation end to end.
    datingClassTest()
结果:
row:1000 and column:3
('the test number:', 100)
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
...,
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
5.0
结果显示错误率为 5%（100 个测试样本中有 5.0 个分类错误，最后一行打印的 5.0 即为错误个数 errorCount）