# Machine Learning in Action
# Chapter 2
# Section 2.2: kNN dating match
#!/usr/bin/python
# -*- coding: utf-8 -*-
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
def createDataSet():
    """Build a tiny hand-made training set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 numpy array of
        sample points and ``labels`` is a list holding one class label per
        row of ``group``.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    class_names = ['A', 'A', 'B', 'B']
    return array(samples), class_names
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: feature vector to classify (sequence or 1-D array with one
            value per column of ``dataSet``).
        dataSet: 2-D numpy array of training samples, one sample per row.
        labels: sequence of class labels, ``labels[i]`` belonging to row
            ``i`` of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest samples.

    Raises:
        IndexError: if ``k`` is less than 1 or larger than the number of
            training samples.
    """
    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1: sum across each row
    distances = sqDistances ** 0.5
    # argsort returns indices ordered from the smallest distance to the largest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        # dict.get returns the default for a missing key without inserting it;
        # the assignment is what creates/updates the entry.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() (rather than the Python-2-only iteritems()) works on Python 2 and 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line is split on tabs; the first three fields become one
    row of the feature matrix and the last field is parsed as the integer
    class label. Blank lines (for example a trailing newline at the end of
    the file) are ignored.

    Args:
        filename: path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        (N, 3) float array and ``classLabelVector`` is a list of N ints.

    Raises:
        ValueError: if a feature field is not a number or the last field is
            not an integer.
    """
    # The context manager guarantees the file is closed even if parsing fails.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    rows = [line for line in stripped if line]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, line in enumerate(rows):
        listFromLine = line.split('\t')
        # Convert explicitly instead of relying on numpy's string-to-float cast.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with min-max normalization.

    Each value becomes ``(value - column_min) / column_range``, so a column
    with distinct values ends up spanning 0 to 1.

    A column whose values are all identical has a range of zero; dividing by
    it would fill the column with NaN. Such ranges are replaced by 1, which
    maps the whole column to 0 and keeps the returned ``ranges`` safe to
    reuse as divisors when normalizing new samples.

    Args:
        dataSet: 2-D numpy array, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` - the normalized array, the
        per-column divisors that were used, and the per-column minimums.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Guard against constant columns (zero range -> division by zero).
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
def datingClassTest():
    """Measure the kNN classifier's error rate on a hold-out slice of the data.

    Loads ``datingTestSet2.txt``, normalizes the features, treats the first
    10% of the rows as test samples and the remaining rows as training data,
    classifies every test sample with k=3, and prints each prediction
    followed by the overall error rate.

    Returns:
        float: fraction of test samples that were misclassified; 0.0 when the
        file has too few rows to form a test set.
    """
    hoRatio = 0.1  # fraction of the rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of held-out test rows
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test samples; the rest is the training set.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print("the classifier came back with: %d,the real answer is:%d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With fewer than 10 rows there are no test vectors; avoid dividing by zero.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is :%f" % errorRate)
    return errorRate
def classifyPerson():
    """Interactively predict how much the user would like a person.

    Prompts for three numeric features, normalizes them with the minimums and
    ranges derived from ``datingTestSet2.txt``, classifies the result with
    k=3 against the whole normalized data set, and prints the verdict.

    Raises:
        ValueError: if an answer cannot be parsed as a float.
    """
    # NOTE(review): assumes the class labels in the data file are 1, 2, 3 so
    # that ``label - 1`` indexes this list - confirm against the data.
    resultList = ['not at all ', 'in small doses ', 'in large doses ']

    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    percentTats = float(readLine('percent of time spent playing video game?'))
    ffMiles = float(readLine('frequent flier miles earned per year?'))
    iceCream = float(readLine('liters of ice cream consumed per year?'))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # NOTE(review): the feature order here (miles, game time, ice cream) must
    # match the column order of the data file - verify.
    inArr = array([ffMiles, percentTats, iceCream])
    # Apply the same scaling that was applied to the training data.
    inArr = (inArr - minVals) / ranges
    classifierResult = classify0(inArr, normMat, datingLabels, 3)
    # Formatted as one string so the output is identical on Python 2 and 3.
    print('you will like this person: %s' % resultList[classifierResult - 1])
# Script entry point: run the interactive classifier when the file is executed.
classifyPerson()
# Alternative demos, left disabled: hold-out evaluation, a scatter plot of the
# first two features, and a smoke test on the toy data set.
#datingClassTest()
#datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
#normMat,ranges,minVals = autoNorm(datingDataMat)
#fig = plt.figure()
#ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1], 15.0*array(datingLabels),15.0*array(datingLabels)) # marker size and color are driven by the class label
# fig.show() exits automatically right after displaying
#plt.show() # displays the figure without exiting
#group,labels = createDataSet()
#result = classify0([0,0],group,labels,3)
#pass