1.处理文本数据:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line is stripped and split on tab characters. The first
    three fields are converted to floats and stored as one matrix row; the
    last field is kept, unconverted, as that row's class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: an ``(n, 3)`` float array
        and a list of ``n`` label strings, where ``n`` is the number of
        non-blank lines in the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If one of the first three fields is not a valid number.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filename) as fr:
        # Skip blank lines (for example a trailing newline) so they neither
        # reserve an all-zero row nor break the float conversion below.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # Convert explicitly instead of relying on implicit str -> float
        # casting during array assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # list.append returns None, so never rebind the list to its result.
        classLabelVector.append(listFromLine[-1])
    # Callers unpack two values, so both results must be returned.
    return returnMat, classLabelVector
2.数据归一化:
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature per
            column.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the rescaled array, the
        per-column ``max - min`` (returned unmodified, so it may contain
        zeros) and the per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide by 1 where the range is zero; the numerator is already 0 there.
    # Work on a copy so the ranges handed back to the caller stay untouched.
    divisors = ranges.copy()
    divisors[divisors == 0] = 1
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tiled copies are needed.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
3.测试准确性:
def datingClassTest():
    """Estimate the classifier's error rate on a held-out slice of the data.

    Loads 'datingTestSet2.txt', normalizes it, and treats the first 10% of
    the rows as test samples and the remaining rows as the reference set
    (the first 100 and the last 900 rows if the file holds 1000 rows, as the
    original note states). Every test row is classified against the
    reference rows, passing 3 as the last argument to classify0 (presumably
    the neighbour count k -- confirm against classify0). Each prediction and
    the overall error rate are printed.
    """
    hoRatio = 0.10
    features, labels = file2matrix('datingTestSet2.txt')
    scaled, _, _ = autoNorm(features)
    rowCount = scaled.shape[0]
    heldOut = int(rowCount * hoRatio)

    misses = 0.0
    for row in range(heldOut):
        expected = labels[row]
        predicted = classify0(
            scaled[row, :],
            scaled[heldOut:rowCount, :],
            labels[heldOut:rowCount],
            3,
        )
        print('the classifier came back with: %s, the real answer is %s' % (predicted, expected))
        if predicted != expected:
            misses += 1.0

    print("the total error rate is: %f" % (misses / float(heldOut)))
4.预测:
def classifyPerson():
    """Interactively predict how much the user will like a person.

    Prompts for three numeric features, normalizes them with the minimum and
    range computed from 'datingTestSet2.txt', classifies the resulting vector
    with classify0 (last argument 3), and prints the matching description.
    The returned label is converted with int() and reduced by one to index
    the description list.
    """
    prompts = (
        'percentage of time spent playing video games?',
        'frequent flier miles earned per year?',
        'liters of ice cream consumed per year?',
    )
    # The questions are asked in this order, one float per answer.
    gameShare, flierMiles, iceCreamLiters = (float(input(text)) for text in prompts)

    features, labels = file2matrix('datingTestSet2.txt')
    scaled, ranges, minVals = autoNorm(features)

    # The feature vector puts flier miles first, unlike the prompt order.
    sample = array([flierMiles, gameShare, iceCreamLiters])
    normalizedSample = (sample - minVals) / ranges
    label = classify0(normalizedSample, scaled, labels, 3)

    descriptions = ['not at all', 'in small doses', 'in large doses']
    print('You will probably like this person:', descriptions[int(label) - 1])