目标:
- 不喜欢的人
- 魅力一般的人
- 极具魅力的人
from numpy import *
def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and a label list.

    Each non-blank line is split on tabs. The first three fields become one
    row of float features and the last field is parsed as an integer class
    label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ndarray of shape ``(n, 3)`` and ``classLabelVector`` is a list of
        ``n`` ints, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a feature field is not a number or the last field is
            not an integer.
    """
    featureRows = []
    classLabelVector = []
    # The context manager closes the file even if parsing raises.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()  # drop the newline and surrounding blanks
            if not line:
                # A blank (often trailing) line carries no sample; skipping it
                # avoids int('') blowing up.
                continue
            listFromLine = line.split('\t')
            featureRows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(int(listFromLine[-1]))
    returnMat = zeros((len(featureRows), 3))
    if featureRows:
        returnMat[:, :] = featureRows
    return returnMat, classLabelVector
(2)对数据进行可视化分析
import matplotlib
import matplotlib.pyplot as plt

# Scatter plot of column 1 against column 2 of datingDataMat, one colour and
# marker size per class label (1, 2, 3) found in datingLabels.
# NOTE(review): datingDataMat / datingLabels are not bound in this snippet;
# presumably they are the two values returned by file2matrix -- confirm.

# matplotlib cannot render Chinese text with its default font, so switch to
# SimHei. rcParams is a dict-like store of the configuration. This is done
# before the figure, axes and legend are created so that every text element
# is built with the new font setting.
matplotlib.rcParams['font.family']='simHei'

# Split the samples into one (x, y) pair of lists per class label.
type1_x, type1_y = [], []
type2_x, type2_y = [], []
type3_x, type3_y = [], []
pointsByLabel = {
    1: (type1_x, type1_y),
    2: (type2_x, type2_y),
    3: (type3_x, type3_y),
}
for i, label in enumerate(datingLabels):
    if label in pointsByLabel:  # any other label value is ignored
        xs, ys = pointsByLabel[label]
        xs.append(datingDataMat[i][1])
        ys.append(datingDataMat[i][2])

fig = plt.figure()
ax = fig.add_subplot(111)
type1 = ax.scatter(type1_x, type1_y, s=20, c='red')
type2 = ax.scatter(type2_x, type2_y, s=30, c='blue')
type3 = ax.scatter(type3_x, type3_y, s=40, c='yellow')
# loc=0 lets matplotlib pick the best legend position automatically.
ax.legend((type1, type2, type3), ('不喜欢','魅力一般','极具魅力'), loc=0)
plt.xlabel('玩视频游戏所耗时间百分比')
plt.ylabel('每周消耗的冰淇淋公斤数')
plt.show()
![](https://i-blog.csdnimg.cn/blog_migrate/2f1dff820a96f9e44f8a1414f13bc0dc.png)
(3)各特征的量纲不同,数值范围大的特征会在距离计算中占据过大的权重,因此需要对变量进行归一化(min-max 标准化):
newValue=(oldValue-min)/(max-min)
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet``.

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is scaled to
    all zeros rather than dividing by zero and producing NaN.

    Args:
        dataSet: 2-D ndarray with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the scaled array (same
        shape as ``dataSet``), the per-column ``max - min`` and the
        per-column minimum.
    """
    minVals = dataSet.min(0)  # axis 0 -> one minimum per column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0; their numerator is 0 anyway.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
(4)测试knn分类效果
def datingClassTest(hoRatio=0.1, filename='datingTestSet2.txt', k=3):
    """Hold out the first rows of the data file and report the error rate.

    The file is loaded with ``file2matrix`` and scaled with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are used as test samples and the
    remaining rows as the training set passed to ``classify0``. Each
    prediction and the overall error rate are printed.

    Args:
        hoRatio: Fraction of the rows held out for testing (default 10%).
        filename: Tab-separated data file to load.
        k: Value forwarded as the last argument of ``classify0``.

    Returns:
        None. Results are printed.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of held-out test rows
    if numTestVecs == 0:
        # Too few rows for this ratio: there is nothing to test, and the
        # error-rate division below would be a division by zero.
        print("no test samples: %d rows with hoRatio=%s" % (m, hoRatio))
        return
    # The training part is the same for every test row, so slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print ("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print( "the total error rate is: %f" % (errorCount/float(numTestVecs)) )
(5)构建完整可用系统(输入对方的信息,给出她对对方喜欢程度的预测值)
def classifyPerson():
    """Ask for one person's three features and print the predicted liking.

    Three numbers are read from standard input. The data file
    'datingTestSet2.txt' is loaded with ``file2matrix`` and scaled with
    ``autoNorm``. The input is scaled with the same minimum and range and
    then classified with ``classify0`` using k=3.

    Returns:
        None. The prediction is printed.

    Raises:
        ValueError: If an answer cannot be parsed as a float.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # Python 3 merged raw_input into input, so the answer is always a string.
    percentTats = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # The training samples are an ndarray, so the query is built as one too.
    # NOTE(review): the order here must match the column order of the data
    # file (presumably miles, game-time %, ice cream) -- confirm.
    inArr = array([ffMiles, percentTats, iceCream])
    # A constant training column has range 0; divide by 1 there so the scaled
    # query stays finite.
    safeRanges = where(ranges == 0, 1, ranges)
    classifierResult = classify0((inArr - minVals) / safeRanges, normMat, datingLabels, 3)
    # classifierResult is a label 1, 2 or 3, hence the -1 to index resultList.
    print('you will probably like this person:', resultList[classifierResult - 1])