实施kNN算法
相关函数
from numpy import *
import operator #本模块主要包括一些Python内部操作符对应的函数
import matplotlib
import matplotlib.pyplot as plt
from os import listdir
# import py_compile
# py_compile.compile('MLIAkNN.py') #Python 中的pyc文件的用途---http://blog.csdn.net/yu132563/article/details/40922049
#http://blog.csdn.net/carolzhang8406/article/details/6342174
def createDataSet():
    """Return a tiny hand-made training set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 numpy array holding
        one sample point per row and ``labels`` is a list with the class label
        of each row, in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: feature vector to classify (sequence or numpy array).
        dataSet: 2-D array of training samples, one sample per row.
        labels: class labels; ``labels[i]`` is the label of ``dataSet[i]``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest samples.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number of
            samples in ``dataSet``.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]  # number of rows == number of samples
    if k < 1 or k > dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (dataSetSize, k))

    # Broadcasting subtracts inX from every row, so no explicit tile() copy
    # of inX is needed.
    diffMat = asarray(inX) - dataSet
    sqDistance = (diffMat ** 2).sum(axis=1)  # sum of squares along each row
    distance = sqDistance ** 0.5
    sortedDistIndicies = distance.argsort()  # sample indices, nearest first

    classCount = {}
    for sampleIndex in sortedDistIndicies[:k]:
        voteLabel = labels[sampleIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # items() exists on both Python 2 and Python 3 (iteritems() is Python 2
    # only). sorted() returns a list of (label, votes) tuples ordered by the
    # vote count, largest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
测试代码
import MLIAkNN

# Build the toy data set and classify the point (0, 0) with k = 3.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
group, labels = MLIAkNN.createDataSet()
print(group)
print(labels)
print(MLIAkNN.classify0([0, 0], group, labels, 3))
结果
[[ 1. 1.1]
[ 1. 1. ]
[ 0. 0. ]
[ 0. 0.1]]
['A', 'A', 'B', 'B']
B
约会数据可视化
相关函数
#读取及解析文本文件,返回训练样本矩阵和类标签向量
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line supplies one sample: its first three tab-separated
    fields are the features and its last field is the integer class label.

    Args:
        filename: path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array of features and ``classLabelVector`` is a list
        of ``n`` integer labels, in file order.

    Raises:
        ValueError: if a field cannot be converted to a number or a line has
            fewer than three feature fields.
    """
    # The context manager closes the file even if reading fails.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Blank lines (for example a trailing empty line) carry no sample.
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))  # three features per sample
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
测试代码
# Visual inspection of the dating data.
# NOTE: datingDataMat and datingLabels are not defined in this snippet; they
# must already be loaded before it runs (presumably via file2matrix).
fig = plt.figure()
ax = fig.add_subplot(111)  # 1 row, 1 column, first subplot
# The scaled class labels are used both as marker size and as marker colour.
# A list comprehension (not map) is used so that array() receives a real
# sequence on Python 3 as well as on Python 2.
markerScale = 15.0 * array([int(label) for label in datingLabels])
# Plot the 2nd feature column against the 3rd.
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], markerScale, markerScale)
plt.show()
结果
实际测试
相关函数
#归一化处理
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is transformed as ``(value - column_min) / column_range``.
    A column whose values are all equal has a range of 0; such a column is
    mapped to 0.0 instead of producing NaN through division by zero.

    Args:
        dataSet: 2-D array-like of numeric samples, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is the
        rescaled float array, ``ranges`` holds ``max - min`` of every column
        (0 for constant columns) and ``minVals`` holds the column minima.
    """
    # Work in floating point so integer input is not floor-divided on Python 2.
    dataSet = asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)  # axis 0: minimum of every column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0; their numerator is already 0.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# normMat, ranges, minVals = autoNorm(datingDataMat)
# print normMat
# print ranges
# print minVals
#测试算法函数,90%作为训练集,10%作为测试集
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the kNN error rate on the dating data with a hold-out split.

    The data is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * hoRatio)`` samples are classified against the
    remaining samples using ``classify0``; every prediction, the final error
    rate and the raw error count are printed.

    Args:
        hoRatio: fraction of the samples held out for testing.
        filename: path of the tab-separated data file to load.
        k: number of neighbours passed to ``classify0``.

    Raises:
        ZeroDivisionError: if the hold-out set is empty, because the error
            rate is computed by dividing by the number of test samples.
    """
    datingDataMat, datingLabels = file2matrix(filename)  # load data set from file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # The training part does not change between iterations, so slice it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
测试代码
# Normalise the data.
# NOTE: datingDataMat is not defined in this snippet; it must already be
# loaded before it runs.
normMat, ranges, minVals = MLIAkNN.autoNorm(datingDataMat)
print(normMat)
print(ranges)
print(minVals)
# Run the hold-out test of the classifier.
MLIAkNN.datingClassTest()
结果
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
5.0
具体使用分类器来进行预判 喜欢度
相关函数
#使用算法:构建完整可用系统
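# When a line of Python gets too long (PEP 8 recommends at most 79 characters), a trailing backslash (\) continues it on the next line; this keeps the logic readable and avoids horizontal scrolling, and such continuations can simply be ignored when reading the code.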
def classifyPerson():
    """Ask for one person's three features and print the predicted liking.

    The three values are read from standard input, normalised with the
    minimum and range returned by ``autoNorm`` for 'datingTestSet2.txt', and
    classified by ``classify0`` with k = 3. The predicted class (1, 2 or 3)
    selects the text that is printed.

    Raises:
        ValueError: if an entered value cannot be converted to ``float``.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    resultList = ['not at all', 'in small doses', 'in large doses']
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    percentTats = float(readLine("percentage of time spent playing video games?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the new sample exactly like the training data before classifying.
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Class labels start at 1 while list indices start at 0.
    print("you will probably like this person: %s"
          % resultList[classifierResult - 1])
测试代码
# Actual usage
MLIAkNN.classifyPerson()
结果
frequent flier miles earned per year?10000
percentage of time spent playing video games?10
liters of ice cream consumed per year?0.5
you will probably like this person: in small doses
手写数字识别
相关函数
def _digitLabelFromFileName(fileNameStr):
    """Return the integer before the first '_' of the file name's stem."""
    fileStr = fileNameStr.split('.')[0]  # take off .txt
    return int(fileStr.split('_')[0])


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Measure the kNN error rate on the handwritten-digit image files.

    Every file in ``trainingDir`` becomes one training sample and every file
    in ``testDir`` is classified with ``classify0``. The true label of a file
    is the integer before the first underscore of its name. Each prediction,
    the number of errors and the error rate are printed.

    ``img2vector`` is defined outside this block; its result is stored in a
    row of a 1024-column matrix, so it presumably returns 1024 values per
    file (TODO confirm).

    Args:
        trainingDir: directory holding the training files.
        testDir: directory holding the test files.
        k: number of neighbours passed to ``classify0``.

    Raises:
        ValueError: if a file name does not start with an integer label.
        ZeroDivisionError: if ``testDir`` contains no files, because the
            error rate is computed by dividing by the number of test files.
    """
    hwLabels = []
    trainingFileList = listdir(trainingDir)  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
测试代码
# Handwritten digit recognition
MLIAkNN.handwritingClassTest()
结果
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the classifier came back with: 9, the real answer is: 9
the total number of errors is: 11
the total error rate is: 0.011628