kNN_约会网站匹配效果改进
【准备数据】数据处理函数
import numpy as np
import os
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must hold at least three numeric fields followed by a
    class-label name in the last field. The first three fields become one row
    of the feature matrix; the last field is converted with ``label2int``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` is a list of the
        ``n`` converted labels, in file order.

    Raises:
        ValueError: If one of the first three fields is not a valid number.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails part-way through.
    with open(filename) as fr:
        # Blank lines (typically a trailing newline at end of file) carry no
        # record and would otherwise fail the float conversion below.
        records = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(records), 3))
    classLabelVector = []
    for index, fields in enumerate(records):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(label2int(fields[-1]))
    return returnMat, classLabelVector
def label2int(labelName):
    """Translate a class-label name into its integer code.

    ``'didntLike'`` maps to 0, ``'smallDoses'`` to 1 and ``'largeDoses'``
    to 2. Any other value yields ``None``.
    """
    # The position of a name in this tuple is its integer code.
    knownNames = ('didntLike', 'smallDoses', 'largeDoses')
    for code, name in enumerate(knownNames):
        if labelName == name:
            return code
    return None
# Load the data set once at module level; the plotting and normalization
# statements further down reuse these two names.
datingDataMat,datingLabels = file2matrix('datingTestSet.txt')
【分析数据】绘制数据散点图
import matplotlib
import matplotlib.pyplot as plt
def arrColor(labels):
    """Return one matplotlib colour code per class label in ``labels``.

    Label 0 maps to ``'r'``, 1 to ``'y'`` and 2 to ``'g'``. Labels outside
    that set are skipped, so the result can be shorter than the input.

    Args:
        labels: Iterable of integer class labels.

    Returns:
        A list of single-letter colour codes, in input order.
    """
    # Use the argument rather than the module-level ``datingLabels`` so the
    # function works for any label sequence passed in.
    colorByLabel = {0: 'r', 1: 'y', 2: 'g'}
    return [colorByLabel[label] for label in labels if label in colorByLabel]
# Draw the three pairwise feature scatter plots stacked vertically in one
# tall figure; each point's colour comes from arrColor.
fig = plt.figure(figsize=(8,20))
#plt.axis([-1,22,-0.1,1.8])
# Top panel: column 0 against column 1.
ax1 = fig.add_subplot(311)
ax1.scatter(datingDataMat[:,0],datingDataMat[:,1],c = arrColor(datingLabels))
# Middle panel: column 1 against column 2.
ax2 = fig.add_subplot(312)
ax2.scatter(datingDataMat[:,1],datingDataMat[:,2],c = arrColor(datingLabels))
# Bottom panel: column 0 against column 2.
ax3 = fig.add_subplot(313)
ax3.scatter(datingDataMat[:,0],datingDataMat[:,2],c = arrColor(datingLabels))
plt.show()
由数据两两对比的三幅散点图分布可知,取第一列和第二列为x,y轴绘制散点图(图一)时,三种类型的人基本分属于不同的区域。
注:用 scatter 绘制散点图时,如果所有类别的数据点都在同一次 scatter 调用中绘制,就无法按颜色为各个类别生成图例。
想要显示图例,需要先按类别对数据分组,再对每一组分别调用 scatter,这样每个类别都会有各自的图例项。
import matplotlib.font_manager as fm
# Font handed to the axis labels and the legend in showClassify, whose text
# is Chinese.
# NOTE(review): this is an absolute Windows font path (presumably Microsoft
# YaHei); it will not exist on other systems - confirm or adjust per machine.
myfont = fm.FontProperties(fname='C:/Windows/Fonts/msyh.ttf')
def showClassify(datingDataMat,datingLabels,x,y,x_name='',y_name=''):
    """Scatter-plot two feature columns with one colour and legend entry per class.

    Rows are grouped by their class label (0, 1 or 2) and each group is drawn
    with its own ``scatter`` call so that the legend can show one entry per
    class. Rows with any other label are not drawn. The figure is displayed
    with ``plt.show()``; nothing is returned.

    Args:
        datingDataMat: Row-indexable feature matrix.
        datingLabels: Sequence of class labels, parallel to the rows.
        x: Column index plotted on the horizontal axis.
        y: Column index plotted on the vertical axis.
        x_name: Text of the horizontal axis label.
        y_name: Text of the vertical axis label.
    """
    # (label value, point colour, legend text), listed in legend order.
    classes = (
        (0, 'r', u'不喜欢'),
        (1, 'y', u'魅力一般'),
        (2, 'g', u'极具魅力'),
    )

    # Collect the x and y coordinates of every row under its class label.
    pointsByLabel = {label: ([], []) for label, _, _ in classes}
    for i, label in enumerate(datingLabels):
        if label in pointsByLabel:
            xs, ys = pointsByLabel[label]
            xs.append(datingDataMat[i][x])
            ys.append(datingDataMat[i][y])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Label the axes object that is actually plotted on. Going through
    # plt.xlabel/plt.ylabel before the subplot exists creates an implicit
    # axes, which is not guaranteed to be the one add_subplot returns.
    ax.set_xlabel(x_name,fontproperties=myfont)
    ax.set_ylabel(y_name,fontproperties=myfont)

    handles = [
        ax.scatter(pointsByLabel[label][0],pointsByLabel[label][1],c = color)
        for label, color, _ in classes
    ]
    legendTexts = [text for _, _, text in classes]
    ax.legend(handles,legendTexts,loc=2,prop=myfont)
    plt.show()
# Plot column 0 against column 1 of the loaded data, with Chinese axis titles.
showClassify(datingDataMat,datingLabels,0,1,u'每年获取的飞行常客里程数',u'玩视频游戏所耗时间百分比')
【准备数据】归一化特征值函数
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly to the range [0, 1].

    Each value is transformed as ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal has a range of zero;
    such a column is normalized to all zeros instead of producing NaN/inf
    from a division by zero.

    Args:
        dataSet: 2-D array-like of numeric values, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minValues)``: the normalized matrix,
        the per-column ``max - min`` and the per-column minimum. ``ranges``
        is reported unmodified, so it is 0 for a constant column and callers
        that divide by it must guard that case themselves.
    """
    data = np.asarray(dataSet)
    minValues = data.min(0)
    ranges = data.max(0) - minValues
    # Divide constant columns by 1 rather than 0; their numerator is already 0.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row, so no tiled
    # copies of them are needed.
    normDataSet = (data - minValues) / safeRanges
    return normDataSet, ranges, minValues
# Normalize the loaded feature matrix; the ranges and minimums are kept as
# module-level names alongside the normalized matrix.
normMat, ranges, minValues = autoNorm(datingDataMat)
array([[ 0.44832535, 0.39805139, 0.56233353],
[ 0.15873259, 0.34195467, 0.98724416],
[ 0.28542943, 0.06892523, 0.47449629],
...,
[ 0.29115949, 0.50910294, 0.51079493],
[ 0.52711097, 0.43665451, 0.4290048 ],
[ 0.47940793, 0.3768091 , 0.78571804]])
k-近邻算法
import operator
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distance is Euclidean. When several labels share the highest vote count,
    the label that was met first while walking the neighbours from nearest to
    farthest wins.

    Args:
        inX: Feature vector to classify (one value per column of ``dataSet``).
        dataSet: 2-D array-like of training samples, one per row.
        labels: Sequence of labels, parallel to the rows of ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number of
            training samples are clamped to that number.

    Returns:
        The winning label.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    samples = np.asarray(dataSet)
    sampleCount = samples.shape[0]
    if sampleCount == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Asking for more neighbours than there are samples would otherwise
    # index past the end of the sorted distances.
    neighbourCount = min(k, sampleCount)

    # Broadcasting subtracts inX from every row; the sign of the difference
    # is irrelevant because it is squared.
    diffMat = samples - np.asarray(inX)
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    nearest = distances.argsort()[:neighbourCount]

    classCount = {}
    for sampleIndex in nearest:
        voteLabel = labels[sampleIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # max() returns the first key with the highest count, and dict order is
    # insertion order, i.e. nearest neighbour first.
    return max(classCount, key=classCount.get)
# Spot check with k=5: classify one already-normalized sample (the values
# equal the third row of the normalized matrix printed above).
classify0([ 0.28542943, 0.06892523, 0.47449629],normMat,datingLabels,5)
0
【测试算法】测试错误率函数
def datingClassTest(hoRatio=0.1, filename='datingTestSet.txt', k=5):
    """Measure the classifier's error rate on a hold-out slice and print it.

    The data file is loaded and normalized; the first ``hoRatio`` fraction of
    the rows is held out as test samples and every remaining row serves as
    training data for ``classify0``. Called without arguments it behaves as
    before: 10% hold-out, ``'datingTestSet.txt'``, k=5.

    Args:
        hoRatio: Fraction of rows, taken from the start of the file, used as
            test samples. It should leave at least one row for training.
        filename: Path of the data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the hold-out slice would contain no rows, since an
            error rate cannot be computed over zero test samples.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minValues = autoNorm(datingDataMat)
    numTestVecs = int(normMat.shape[0] * hoRatio)
    if numTestVecs < 1:
        raise ValueError("hold-out set is empty; increase hoRatio or use more data")

    # The training slices are the same for every test sample, so build them once.
    trainMat = normMat[numTestVecs:, :]
    trainLabels = datingLabels[numTestVecs:]
    errorCount = sum(
        1
        for i in range(numTestVecs)
        if classify0(normMat[i, :], trainMat, trainLabels, k) != datingLabels[i]
    )
    print("the total error rate is: %f" % (errorCount/float(numTestVecs)))
# Run the hold-out evaluation; the function prints the total error rate.
datingClassTest()
the total error rate is: 0.040000
【使用算法】采集数据并输出预测结果
def classifyPerson():
    """Ask for one person's three feature values and print the predicted class.

    The three values are read from standard input, the training data is
    loaded from ``'datingTestSet.txt'`` and normalized, the answers are
    scaled with the same minimums and ranges, and ``classify0`` with k=5
    picks the class. The matching description is printed; nothing is
    returned.
    """
    # Prompts are asked in this order; the feature vector below is assembled
    # in a different order (flier miles first).
    gamePercent = float(input("percentage of thime spent playing video games?"))
    flierMiles = float(input("frequent flier miles earned per year?"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    trainingMat, trainingLabels = file2matrix('datingTestSet.txt')
    normalizedMat, featureRanges, featureMins = autoNorm(trainingMat)

    # Scale the query exactly like the training rows before measuring distances.
    rawQuery = np.array([flierMiles, gamePercent, iceCreamLiters])
    scaledQuery = (rawQuery - featureMins) / featureRanges
    predicted = classify0(scaledQuery, normalizedMat, trainingLabels, 5)

    # Index in this sequence == integer class label.
    descriptions = ['not at all', 'in small doses', 'in large doses']
    print("You will probably like this person:", descriptions[predicted])
# Interactive demo: prompts for the three feature values and prints the prediction.
classifyPerson()
percentage of thime spent playing video games?8
frequent flier miles earned per year?40000
liters of ice cream consumed per year?0.95
You will probably like this person: in large doses