# 机器学习实战python版归一化数值和测试kNN算法以及构建完整可用系统

## 归一化数值：

def autoNorm(dataSet):
    """Scale every column of dataSet linearly into the [0, 1] range.

    Parameters
    ----------
    dataSet : 2-D numpy array, one sample per row, one feature per column.

    Returns
    -------
    (normDataSet, ranges, minVals) where
        normDataSet = (dataSet - minVals) / ranges, element-wise,
        ranges      = per-column (max - min) spread,
        minVals     = per-column minimum.
    """
    minVals = dataSet.min(0)      # column-wise minimum
    maxVals = dataSet.max(0)      # column-wise maximum
    ranges = maxVals - minVals    # per-column spread
    # NumPy broadcasting applies the row vectors to every row, which makes
    # the original tile() copies and the dead zeros() pre-allocation
    # (it was overwritten immediately) unnecessary.
    # NOTE(review): a constant column (range 0) divides by zero here —
    # same behavior as the original tile-based version.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals

import matplotlib
import matplotlib.pyplot as plt
import kNN

# Load the dating data and normalize it column-wise to [0, 1].
datingDataMat, datingLabels = kNN.file2matrix('datingTestSet2.txt')
# BUG FIX: the original called kN.autoNorm — 'kN' is an undefined name;
# the imported module is kNN.
normMat, ranges, minVals = kNN.autoNorm(datingDataMat)

>>> normMat
array([[ 0.44832535,  0.39805139,  0.56233353],
[ 0.15873259,  0.34195467,  0.98724416],
[ 0.28542943,  0.06892523,  0.47449629],
...,
[ 0.29115949,  0.50910294,  0.51079493],
[ 0.52711097,  0.43665451,  0.4290048 ],
[ 0.47940793,  0.3768091 ,  0.78571804]])
>>> ranges
array([  9.12730000e+04,   2.09193490e+01,   1.69436100e+00])
>>> minVals
array([ 0.      ,  0.      ,  0.001156])


## 测试算法：

def datingClassTest():
hoRatio = 0.50      #hold out 10%
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')       #load data setfrom file
normMat, ranges, minVals = autoNorm(datingDataMat)
m = normMat.shape[0]
numTestVecs = int(m*hoRatio)#前多少行为测试数据
errorCount = 0.0
for i in range(numTestVecs):
classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],3)
#normMat[i,:]即为前numTestVecs行的数据，后面为比对数据
print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
if (classifierResult != datingLabels[i]): errorCount += 1.0
#如果测试数据的结果和数据资料中应有的结果不一致则错误标记加一。
print "the total error rate is: %f" % (errorCount/float(numTestVecs))
print errorCount
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the total error rate is: 0.064000
32.0

## 使用算法：

def classifyPerson():
resultList = ['not at all','in small doses', 'in larfe doses']
percentTats = float(raw_input("percentage of time playing video games?"))
ffMiles = float(raw_input("frequent flier miles earned per years?"))
iceCream = float(raw_input("liters of ice cream consumed per years?"))
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
normMat,ranges,minVals = autoNorm(datingDataMat)
inArr = array([ffMiles,percentTats,iceCream])
classifierResult = classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
print "you will probably like this person: ",resultList[classifierResult -1]

>>> percentage of time playing video games?10 frequent flier miles earned per years?10000 liters of ice cream consumed per years?0.5 you will probably like this person: in small doses

03-08 701

02-01 494

01-25 155

11-26 1720

03-30 1220

12-16 1.4万

05-18 1787

02-11 2万

12-20 1.5万

08-26 7629