参考
测试代码:
import numpy as np
import operator
import matplotlib.pyplot as plt
def fileReader(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is expected to contain three numeric feature
    columns followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, listLabels)``. ``returnMat`` is an ``(n, 3)``
        float array holding the first three columns of each record and
        ``listLabels`` is a list with the integer label of each record,
        in file order.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # Read the file once and let the context manager close the handle; the
    # file used to be opened twice without either handle being closed.
    with open(filename) as fr:
        records = [line.strip() for line in fr]
    # Ignore blank lines (e.g. an empty line at the end of the file) rather
    # than failing while trying to parse them.
    records = [record for record in records if record]

    returnMat = np.zeros((len(records), 3))
    listLabels = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        # Convert explicitly instead of relying on NumPy casting strings.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        listLabels.append(int(listFromLine[-1]))
    return returnMat, listLabels
def autoNorm(dataSet):
    """Min-max normalise each column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal is mapped to 0 instead of NaN.

    Args:
        dataSet: Array-like of numeric values with samples along axis 0.

    Returns:
        A new float array of the same shape as ``dataSet``; the input is
        not modified.
    """
    # Work in floating point so integer input is not floor-divided.
    dataSet = np.asarray(dataSet, dtype=float)
    minVal = dataSet.min(0)
    ranges = dataSet.max(0) - minVal
    # A constant column has a range of 0; dividing by 1 instead keeps the
    # (all-zero) numerator and avoids producing NaN from 0/0.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column min/range to every row, so no
    # explicit np.tile copies are needed.
    return (dataSet - minVal) / safeRanges
def classfy(testdata, dataSet, labels, k):
    """Classify ``testdata`` by majority vote among its ``k`` nearest rows.

    Distances are Euclidean, computed between ``testdata`` and every row of
    ``dataSet``.

    Args:
        testdata: Feature vector of the sample to classify.
        dataSet: 2-D array of reference samples, one per row.
        labels: Sequence giving the label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` exceeds the number of rows in ``dataSet`` or
            no neighbour is selected (``k`` < 1).
    """
    # Broadcasting subtracts every reference row from the query vector.
    offsets = testdata - dataSet
    euclidean = (offsets ** 2).sum(axis=1) ** 0.5
    order = euclidean.argsort()

    # Labels of the k closest rows, nearest first.
    nearestLabels = [labels[order[rank]] for rank in range(k)]

    votes = {}
    for label in nearestLabels:
        votes[label] = votes.get(label, 0) + 1

    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    winner, _ = ranking[0]
    return winner
if __name__ == "__main__":
    # Hold-out evaluation of the classifier: the first ``hoRatio`` share of
    # the normalised samples is used as test data and the remaining samples
    # serve as the reference set.
    # Raw string so the backslashes of the Windows path are never treated
    # as escape sequences.
    dataSet, dataLabel = fileReader(r'D:\Leaarn\MLtest\KNN\data\datingSet.txt')
    normSet = autoNorm(dataSet)
    hoRatio = 0.1
    m = normSet.shape[0]
    numTest = int(hoRatio * m)
    errorCount = 0.0
    for i in range(numTest):
        classifierResult = classfy(normSet[i, :], normSet[numTest:m, :],
                                   dataLabel[numTest:m], 3)
        # Parenthesised single-argument print works on Python 2 and 3.
        print("the classifier answer: %d, the real answer is: %d"
              % (classifierResult, dataLabel[i]))
        if classifierResult != dataLabel[i]:
            errorCount += 1.0
    if numTest:
        print("The total error rate is : %f" % (errorCount / float(numTest)))
    else:
        # Too few samples to hold any out; avoid dividing by zero.
        print("No test samples available; error rate not computed")