1.准备数据:从文本文件中解析数据
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is split on tab characters. The first three fields
    are stored as numeric features and the last field is stored as the
    integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a NumPy array with one row
        per record and three columns, and a list with one integer label per
        record.

    Raises:
        ValueError: If a feature field is not a number, the label field is
            not an integer, or a record has a number of feature fields that
            cannot fill the three columns.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        strippedLines = [line.strip() for line in fr]

    # Blank lines (for example a trailing newline at the end of the file)
    # carry no record, so they are dropped before the matrix is sized.
    records = [line.split('\t') for line in strippedLines if line]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, fields in enumerate(records):
        # Convert explicitly instead of relying on NumPy's implicit
        # string-to-float cast during assignment.
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
import sys
# Add the directory that holds kNN.py to the module search path.
sys.path.append('/home/yang/Software/pycharm-community-2017.3.4/bin/桌面/PycharmProjects/untitled/k18')
import kNN
reload(kNN)  # re-import so that edits made to kNN.py take effect in this session
# Load the feature matrix and the class labels from the data file.
datingDataMat, datingLabels = kNN.file2matrix(
    r'/home/yang/Software/pycharm-community-2017.3.4/bin/桌面/PycharmProjects/untitled/k18/datingTestSet2.txt')
检查数据内容:
datingDataMat
2.分析数据:使用Matplotlib创建散点图
import matplotlib
import matplotlib.pyplot as plt

# Plot column 1 of the feature matrix against column 2 on a single axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)  # 1x1 grid, first (only) subplot
ax.scatter(x=datingDataMat[:, 1], y=datingDataMat[:, 2])
plt.show()
画出的图没有记号 不好观察。改上面的一句为:
from numpy import *  # required here: provides array() used in the call below
# Use 15 times the class label as both the marker size (s) and the marker
# color value (c), so that the classes become visually distinguishable.
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
           s=15.0 * array(datingLabels), c=15.0 * array(datingLabels))
仍然不容易得出结论,各类别难以区分
(我们利用颜色和尺寸标识了数据点的类别,因而基本上可以从上图看出数据点所属的三个样本分类。上图使用矩阵的第1列和第2列(玩视频游戏所占时间百分比、每年消费的冰淇淋公升数)展示数据,虽然可以区别,但是区分度不高。
下面我们使用每年赢得的飞行常客里程数与玩视频游戏所占时间百分比绘制约会数据散点图。约会数据有三个特征,通过下图展示的这两个特征更容易区分数据点从属的类别)尝试采用第0列和第1列的属性值得出结果:
3.准备数据:归一化数值
处理不同取值范围的特征值,需要归一化
如将取值范围处理为0~1之间:
newValue = (oldValue-min)/(max-min)
其中,max和min分别代表数据集中的最大特征值和最小特征值。虽然改变数值取值范围增加了分类器的复杂度,但为了得到准确结果,我们必须这样做。我们需要在文件kNN.py中增加一个新函数autoNorm(),该函数可以将数字特征值转换到0~1区间。autoNorm()的代码如下:
def autoNorm(dataSet):
    """Rescale every feature column with (value - min) / (max - min).

    Args:
        dataSet: NumPy array with one row per sample and one column per
            feature.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled data, the
        per-column ``max - min`` values, and the per-column minima.
        ``ranges`` is returned unmodified, so it contains 0 for a column
        whose values are all equal.
    """
    minVals = dataSet.min(0)  # axis 0: one minimum per column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # A constant column has range 0; dividing by it would give 0/0 = NaN.
    # Divide such columns by 1 instead, which maps them to 0.
    divisors = ranges.copy()
    divisors[divisors == 0] = 1

    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tile() copies are needed.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
测试:
reload(kNN)
Out[43]: <module 'kNN' from '/home/yang/Software/pycharm-community-2017.3.4/bin/桌面/PycharmProjects/untitled/k18/kNN.py'>
normMat, ranges, minVals = kNN.autoNorm(datingDataMat)
normMat
Out[45]:
array([[ 0.44832535, 0.39805139, 0.56233353],
[ 0.15873259, 0.34195467, 0.98724416],
[ 0.28542943, 0.06892523, 0.47449629],
...,
[ 0.29115949, 0.50910294, 0.51079493],
[ 0.52711097, 0.43665451, 0.4290048 ],
[ 0.47940793, 0.3768091 , 0.78571804]])
ranges
Out[46]: array([ 9.12730000e+04, 2.09193490e+01, 1.69436100e+00])
minVals
Out[47]: array([ 0. , 0. , 0.001156])
4.测试算法:作为完整程序验证分类器
def datingClassTest(hoRatio=0.1,
                    filename=r'/home/yang/Software/pycharm-community-2017.3.4/bin/桌面/PycharmProjects/untitled/k18/datingTestSet2.txt',
                    k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The first ``int(m * hoRatio)`` rows of the normalized data are used as
    test vectors and the remaining rows as the training set. Every
    prediction and the final error rate are printed.

    Args:
        hoRatio: Fraction of the rows used as test vectors (default 0.1).
        filename: Path of the data file handed to ``file2matrix``.
        k: Neighbor count passed as the last argument to ``classify0``.

    Raises:
        ZeroDivisionError: If ``int(m * hoRatio)`` is 0, because the error
            rate is computed as a ratio over the number of test vectors.
    """
    datingDataMat, datingLabels = file2matrix(filename)  # load the data
    normMat, ranges, minVals = autoNorm(datingDataMat)   # normalize the features
    m = normMat.shape[0]                                 # total number of rows
    numTestVecs = int(m * hoRatio)                       # number of test vectors
    errorCount = 0.0
    for i in range(numTestVecs):
        # classify0 arguments: inX, dataSet, labels, k
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m],
                                     k)
        print('the classifier came back with:%d,the real answer is:%d'
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is :%f" % (errorCount / float(numTestVecs)))
结果:
the total error rate is :0.050000
5.使用算法:构建完整可用系统
def classifyPerson():
    """Ask for one person's three feature values and print the prediction.

    The three values are read interactively, scaled with the minima and
    ranges that ``autoNorm`` computes from the data file, classified with
    ``classify0`` using 3 neighbors, and the matching description is printed.
    """
    likingLevels = ['not at all', 'in small doses', 'in large doses']

    # The prompts are asked in this order; each answer is parsed as a float.
    gameShare = float(raw_input("percentage of time spent playing video games?"))
    flierMiles = float(raw_input("frequent flier miles earned per year?"))
    iceCreamLiters = float(raw_input("liters of ice cream consumed per year?"))

    features, labels = file2matrix(r'/home/yang/Software/pycharm-community-2017.3.4/bin/桌面/PycharmProjects/untitled/k18/datingTestSet2.txt')
    normalized, spans, minima = autoNorm(features)

    # The sample is assembled as (miles, game share, ice cream) and scaled
    # the same way as the training data before it is classified.
    sample = array([flierMiles, gameShare, iceCreamLiters])
    scaledSample = (sample - minima) / spans
    predicted = classify0(scaledSample, normalized, labels, 3)

    # The label is used as a 1-based index into the descriptions
    # (presumably labels are 1, 2 or 3 -- confirm against the data file).
    print("you will probably like this person:", likingLevels[predicted - 1])
raw_input()的作用是让用户输入一行文本,并以字符串形式返回用户所输入的内容
测试:
percentage of time spent playing video games?>? 10
frequent flier miles earned per year?>? 10000
liters of ice cream consumed per year?>? 0.5
('you will probably like this person:', 'in small doses')