约会数据: https://github.com/pbharrin/machinelearninginaction/blob/master/Ch02/datingTestSet2.txt
代码如下:
import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt
# Data loading and preprocessing
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line supplies one sample: its first three tab-separated
    fields are the numeric features and its last field is the integer class
    label. Blank lines (for example a trailing empty line) are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is a float
        ndarray with one row per data line and three columns, and
        ``classLabelVector`` is the list of int labels in file order.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    with open(filename) as f:
        # Split only lines that still contain something after stripping, so
        # empty lines never reach the numeric conversion below.
        rows = [line.strip().split('\t') for line in f if line.strip()]

    # The matrix is sized from the real data rows, not the raw line count.
    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# Min-max normalization: newvalue = (prevalue - min) / (max - min)
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Applies min-max scaling per column:
    ``newvalue = (prevalue - min) / (max - min)``.

    Args:
        dataset: Numeric ndarray with one sample per row and one feature per
            column.

    Returns:
        An ndarray of the same shape holding the scaled values. A column whose
        values are all equal is mapped to 0 instead of producing NaN.
    """
    minVals = dataset.min(0)
    ranges = dataset.max(0) - minVals
    # A constant column has a range of 0; dividing by 1 instead keeps the
    # result finite (its numerator is already 0 for every row).
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit np.tile copies are needed.
    return (dataset - minVals) / safeRanges
# KNN classifier
def classfy0KNN(intX, dataset, labels, K):
    """Classify ``intX`` by majority vote among its K nearest training samples.

    Nearness is the Euclidean distance between ``intX`` and each row of
    ``dataset``.

    Args:
        intX: Feature vector to classify, with one value per dataset column.
        dataset: ndarray of training samples, one sample per row.
        labels: Sequence of class labels aligned with the rows of ``dataset``.
        K: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label encountered first
        when walking the neighbours from nearest to farthest wins.

    Raises:
        IndexError: If no neighbour can vote (empty ``dataset`` or ``K < 1``).
    """
    from collections import Counter

    # Broadcasting subtracts intX from every row without building a tiled copy.
    diff = dataset - np.asarray(intX)
    distance = (diff ** 2).sum(axis=1) ** 0.5
    sortIndex = distance.argsort()

    # Never ask for more neighbours than there are training samples.
    neighbours = min(K, dataset.shape[0])
    votes = Counter(labels[sortIndex[n]] for n in range(neighbours))
    # most_common keeps first-encountered order among equal counts, which
    # matches a stable descending sort of the vote table.
    return votes.most_common(1)[0][0]
def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.1, k=4):
    """Evaluate the KNN classifier on a hold-out slice of the dating data.

    The file is loaded with ``file2matrix`` and normalized with ``autoNorm``.
    The first ``int(m * hoRatio)`` samples are then classified against the
    remaining samples, and every prediction plus the overall error rate is
    printed.

    Args:
        filename: Path of the tab-separated data file.
        hoRatio: Fraction of the samples (taken from the start of the file)
            used as test vectors.
        k: Number of neighbours passed to ``classfy0KNN``. It is a
            hyperparameter; the original author reports 4 as the best value
            found by experiment.

    Returns:
        The fraction of misclassified test vectors, or NaN when the hold-out
        slice is empty.
    """
    dataset, labels = file2matrix(filename)
    dataset = autoNorm(dataset)
    numTestVecs = int(dataset.shape[0] * hoRatio)

    # The training portion is the same for every test vector, so slice once.
    trainSet = dataset[numTestVecs:, :]
    trainLabels = labels[numTestVecs:]

    errorCount = 0
    for i in range(numTestVecs):
        predictClass = classfy0KNN(dataset[i, :], trainSet, trainLabels, k)
        print('predict output is : ', predictClass, ' the real is : ', labels[i])
        if predictClass != labels[i]:
            errorCount += 1

    # With no test vectors the rate is undefined; report NaN instead of
    # failing with a division by zero.
    errorRate = errorCount / numTestVecs if numTestVecs else float('nan')
    print('the total error rate is : ', errorRate)
    return errorRate
if __name__ == '__main__':
    # Run the hold-out evaluation when this file is executed as a script.
    datingClassTest()
    # Disabled exploratory code below: a scatter plot of the normalized data
    # and a manual single-sample prediction.
    # dataset,labels = file2matrix('datingTestSet2.txt')
    # dataset = autoNorm(dataset)
    # fig = plt.figure()
    # A three-digit argument such as 349 means: split the canvas into 3 rows
    # and 4 columns and draw in the 9th cell, counting left to right and top
    # to bottom. The 221 used here is the 1st cell of a 2x2 grid.
    # ax = fig.add_subplot(221)
    # Use marker size and colour to encode the class label of each point.
    # ax.scatter(dataset[:,0],dataset[:,1],15.0*np.array(labels),15.0*np.array(labels))
    # plt.show()
    # print(dataset)
    # print(labels)
    # intX = []
    # predictLable = classfy0KNN(intX,dataset,labels,20)
运行结果: