1-前言
这个系列博客是我的学习笔记,用来记录学习这本书的过程中遇到的问题。
2-程序2-1:
1:导入必要的包
from numpy import *
import operator
2:构造示例数据集(dataSet)及其对应的标签
def createDataSet():
    """Build the four-point sample data set used to try out the classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array with
        one sample per row and ``labels`` holds the class label of each row.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
3-实现kNN算法
3.1- 实现核心部分,后面会调用这个函数
def classify(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: feature vector to classify, one value per column of ``dataSet``.
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: sequence of class labels; ``labels[i]`` belongs to row ``i``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label with the highest vote count among the ``k`` nearest rows.
    """
    dataSetSize = dataSet.shape[0]  # number of training samples (rows)

    # Repeat inX once per training row so it can be subtracted element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)  # add up the squared differences of each row
    distances = sqDistances ** 0.5

    # Row indices ordered from the nearest sample to the farthest one.
    sortedDistIndicies = distances.argsort()

    # Count how often each label occurs among the k nearest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
3.2- 将txt文件中的数据转化成matrix
def file2matrix(fileName):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line supplies three numeric feature values in its first
    three columns and an integer class label in its last column. Blank lines
    are skipped.

    Args:
        fileName: path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        N x 3 float array (one row per data line) and ``classLabelVector``
        is the list of N integer labels in the same order.

    Raises:
        ValueError: if a feature or label field cannot be converted to a
            number, or a line has fewer than three feature fields.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(fileName) as fr:
        stripped = [line.strip() for line in fr]
    rows = [row for row in stripped if row]  # ignore blank lines

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
3.3- 因为里程数的数值远大于其他两个特征,计算距离时会占主导地位、削弱其他两个特征的影响,所以要对每一列特征做归一化。
归一化公式为:newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with min-max normalisation.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is divided
    by 1 instead, so its normalised values are 0 rather than NaN.

    Args:
        dataSet: 2-D NumPy array, one sample per row, one feature per column.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` has
        the shape of ``dataSet``, ``ranges`` is the per-column ``max - min``
        (returned as computed, so it may contain zeros) and ``minVals`` is
        the per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Avoid 0/0 for constant columns without altering the returned ranges.
    safeRanges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
3.4- 测试分类器性能
def datingClassTest():
    """Measure the classifier's error rate on a hold-out slice of the data.

    Loads 'datingTestSet2.txt', normalises it, and uses the first 10% of the
    rows as test samples. Each test sample is classified against the
    remaining rows with k = 3. Every prediction is printed next to the real
    label, followed by the overall error rate.
    """
    hoRatio = 0.10  # fraction of the rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]

    # Rows [0, numTestVecs) are test samples; rows [numTestVecs, m) are the
    # reference samples the classifier votes among.
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify(normMat[i, :],
                                    normMat[numTestVecs:m, :],
                                    datingLabels[numTestVecs:m],
                                    3)
        print("the classifier came back with : " + str(classifierResult) + " ,the real answer is : " + str(datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is " + str(errorCount/float(numTestVecs)))
3.5- 对输入数据进行预测
def classifyPerson():
    """Ask for one person's three feature values and print the prediction.

    Reads three numbers from standard input, normalises them with the
    per-column minimum and range computed from 'datingTestSet2.txt',
    classifies the result against that data set with k = 3, and prints the
    matching entry of ``resultList``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video game?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Apply the same min-max scaling to the input as was applied to the data.
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify((inArr - minVals)/ranges, normMat, datingLabels, 3)

    # classifierResult - 1 is used as the index into resultList.
    print("You will probably like this person: " + resultList[classifierResult - 1])
注:其中几个函数的用法:
1:tile(x,(n,m)) 是把 x 作为一个整体,在行方向重复 n 次、在列方向重复 m 次,例如:
from numpy import *

# tile(x, (2, 1)) repeats x twice along the rows and once along the columns.
x = [[1, 2], [2, 3]]
result = tile(x, (2, 1))
print(result)
# Printed output:
# [[1 2]
#  [2 3]
#  [1 2]
#  [2 3]]
2:distances.argsort() 返回把 distances 按从小到大排序后,各元素在原数组中的下标,例如:
# argsort gives the positions that would put `distances` in ascending order.
distances = array([5, 2, 4, 1])
sortedDistances = argsort(distances)
print(sortedDistances)
'''输出结果
[3 1 2 0]
'''