在上一篇的基础上增加如下代码:
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line is expected to hold four tab-separated columns:
    frequent flier miles earned per year, percentage of time spent playing
    video games, liters of ice cream consumed per week, and an integer class
    label (did not like / average charm / very charming).

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an N x 3
        NumPy array filled with the first three fields of every record, and
        ``classLabelVector`` is a list with the N integer labels taken from
        the last field of every record.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        # strip() removes surrounding whitespace ('\n', '\r', '\t', ' ').
        stripped = [line.strip() for line in fr]
    # Blank lines (typically a trailing newline at the end of the file) carry
    # no record; parsing them would fail on int('').
    records = [line for line in stripped if line]

    returnMat = zeros((len(records), 3))  # zero-filled N x 3 matrix
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        # The first three fields are the features; convert them explicitly
        # rather than relying on implicit string-to-float casting.
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        # The last field is the label; int() keeps it from staying a string.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Normalize feature values.
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1] (min-max).

    Args:
        dataSet: 2-D NumPy array; statistics are taken down axis 0, i.e. one
            minimum and one maximum per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: ``normDataSet`` is
        ``(dataSet - minVals) / ranges`` computed column-wise, ``ranges`` is
        the per-column ``max - min`` and ``minVals`` the per-column minimum.
        A column whose values are all equal has a range of 0; it is mapped
        to 0.0 instead of NaN, while the returned ``ranges`` still reports 0
        for that column.
    """
    minVals = dataSet.min(0)  # axis 0 selects the minimum of each column
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals

    # Divide by a float copy so that (a) ``ranges`` itself is returned
    # untouched, (b) integer input is not floor-divided on Python 2, and
    # (c) zero ranges can be replaced to avoid a 0/0 division.
    divisor = ranges.astype(float)
    divisor[divisor == 0] = 1.0

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisor
    return normDataSet, ranges, minVals
# Hold-out test of the classifier on the dating-site data.
def dataingClassTest(hoRatio=0.1, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate on a held-out slice of the data.

    The data file is loaded with ``file2matrix`` and normalized with
    ``autoNorm``. The first ``int(m * hoRatio)`` rows are used as test
    samples; each one is classified by ``classify0`` against the remaining
    rows and their labels. Every prediction and the final error rate are
    printed.

    Args:
        hoRatio: Fraction of the rows (taken from the top) used for testing.
        filename: Path of the tab-separated data file.
        k: Last argument handed to ``classify0`` (presumably the number of
            neighbours; confirm against ``classify0``).

    Returns:
        None. Results are reported on standard output only.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of rows used for testing
    if numTestVecs == 0:
        # Too few rows for the requested ratio: report it instead of
        # failing with ZeroDivisionError on the final rate computation.
        print("no test vectors available: %d samples, hold-out ratio %f"
              % (m, hoRatio))
        return

    errorCount = 0.0  # number of misclassified test rows
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        # A single parenthesized expression prints identically on
        # Python 2 and Python 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
测试:
... ...
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
错误率为5%
添加以下函数,进行预测
# Interactive prediction for the dating-site data.
def classifyPerson():
    """Ask for one person's three feature values and print a prediction.

    The three values are read from standard input, normalized with the
    minimum values and ranges that ``autoNorm`` derives from
    'datingTestSet2.txt', and classified by ``classify0`` against that
    normalized data set. The label returned by ``classify0`` is used as a
    1-based index into the list of answer strings.

    Returns:
        None. The prediction is printed to standard output.

    Raises:
        ValueError: If an answer cannot be converted with ``float()``.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(read_line("percentage of time spent playing video games?"))
    ffMiles = float(read_line("frequent flier miles earned per year?"))
    # NOTE(review): this prompt says "per year", but the data-file notes in
    # this file describe the ice-cream column as liters per week; confirm
    # which unit is intended before changing the prompt.
    iceCream = float(read_line("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order follows the order in which the values were collected by
    # file2matrix: miles, game-time percentage, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    # The new sample is scaled with the same minVals/ranges as the data set.
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Built as one string so the output is the same on Python 2 and 3.
    print("You will probably like this person:" + " "
          + resultList[classifierResult - 1])
>>> import KNN
>>> KNN.classifyPerson()
percentage of time spent playing video games?20
frequent flier miles earned per year?10000
liters of ice cream consumed per year?0.6
You will probably like this person: in large doses