目的:分类对象为:
1.不喜欢的人 'not at all'
2.魅力一般的人 'in small doses'
3.极具魅力的人 'in large doses'
1. kNN算法代码:
def classify0(inX,dataSet,labels,k):
dataSetSize = dataSet.shape[0]
diffMat=tile(inX,(dataSetSize,1))-dataSet
sqDiffMat=diffMat**2
sqDistances=sqDiffMat.sum(axis=1)
distances=sqDistances**0.5
sortedDistIndicies=distances.argsort()
classCount={}
for i in range(k):
voteIlabel=labels[sortedDistIndicies[i]]
classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
return sortedClassCount[0][0]
2. 数据处理:将文本文件转为Numpy矩阵:文本数据来源
def file2matrix(filename):
love_dictionary={'largeDoses':3, 'smallDoses':2, 'didntLike':1}
fr = open(filename)
arrayOLines = fr.readlines()
numberOfLines = len(arrayOLines) #get the number of lines in the file
returnMat = zeros((numberOfLines,3)) #prepare matrix to return
classLabelVector = [] #prepare labels return
index = 0
for line in arrayOLines:
line = line.strip()
listFromLine = line.split('\t')
returnMat[index,:] = listFromLine[0:3]
if(listFromLine[-1].isdigit()):
classLabelVector.append(int(listFromLine[-1]))
else:
classLabelVector.append(love_dictionary.get(listFromLine[-1]))
index += 1
return returnMat,classLabelVector
由于特征之间差距太大,会导致极大的预测误差,要进行归一化处理:
def autoNorm(dataSet):
minVals=dataSet.min(0)
maxVals=dataSet.max(0)
ranges=maxVals-minVals
normDataSet=zeros(shape(dataSet))
m=dataSet.shape[0]
normDataSet=dataSet-tile(minVals,(m,1))
normDataSet=normDataSet/tile(ranges,(m,1))
return normDataSet,ranges,minVals
3. kNN分类器针对约会网站测试代码:
def datingClassTest():
hoRatio=0.10
datingDataMat,datingLabels=file2matrix('/Users/xxx/Downloads/machinelearninginaction-master/Ch02/datingTestSet.txt')
normMat,ranges,minVals=autoNorm(datingDataMat)
m=normMat.shape[0]
numTestVecs=int(m*hoRatio)
errorCount=0.0
for i in range(numTestVecs):
classifierResult=classify0(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],3)
print("the classifier came back with:%d, the real answer is:%d" %(classifierResult,datingLabels[i]))
if(classifierResult!=datingLabels[i]):
errorCount+=1.0
print("the total error rate is:%f" %(errorCount/float(numTestVecs)))
最后得到错误率为5%
4. 约会网站预测函数:
def classifyPerson():
resultList = ['not at all', 'in small doses', 'in large doses']
percentTats = float(input("percentage of time spent playing video games?"))
ffMiles = float(input("frequent flier miles earned per year?"))
iceCream = float(input("liters of ice cream consumed per year?"))
datingDataMat, datingLabels = file2matrix('/Users/xxx/Downloads/machinelearninginaction-master/Ch02/datingTestSet2.txt')
normMat, ranges, minVals = autoNorm(datingDataMat)
inArr = array([ffMiles, percentTats, iceCream, ])
classifierResult = classify0((inArr - minVals)/ranges, normMat, datingLabels, 3)
print("You will probably like this person: %s" % resultList[classifierResult - 1])