#coding=utf-8
from numpy import *
import operator
import string
import matplotlib.pyplot as plt
import numpy as np
# Build a small sample data set
def createDataSet():
    """Return a tiny hard-coded training set for trying out the classifier.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 numpy array of sample
        points and ``labels`` is the list of class labels, one per row.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# k-nearest-neighbours (kNN) classifier
def knn_classify0(inx, dataSet, labels, k):
    """Classify ``inx`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean, measured between ``inx`` and every row of
    ``dataSet``.

    Args:
        inx: input vector to classify (sequence or 1-D array).
        dataSet: training samples, one row per sample (2-D array-like).
        labels: class label for each row of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes. Ties are resolved by the
        stable sort over the vote dictionary's iteration order.

    Raises:
        IndexError: if ``k`` is larger than the number of samples, or if
            ``k`` is not positive (no votes are cast).
    """
    dataSet = np.asarray(dataSet, dtype=float)
    # Broadcasting subtracts inx from every row; no explicit tiling needed.
    diffmat = np.asarray(inx, dtype=float) - dataSet
    # Sum the squared differences along each row, then take the square root.
    sqdistance = (diffmat ** 2).sum(axis=1)
    distance = sqdistance ** 0.5
    # Sample indices ordered from nearest to farthest.
    sortedDisIndex = distance.argsort(axis=0)
    # Tally one vote per neighbour, keyed by its label.
    classCount = {}
    for i in range(k):
        votelabel = labels[sortedDisIndex[i]]
        classCount[votelabel] = classCount.get(votelabel, 0) + 1
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    sortedclassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedclassCount[0][0]
def file2matrix(filename, numFeatures=3):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line supplies one sample: its first ``numFeatures``
    tab-separated fields are parsed as floats and its last field is kept
    (as a string) as the class label. Blank lines are skipped.

    Args:
        filename: path of the text file to read.
        numFeatures: number of leading columns to load as features.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: a float array with one row
        per sample and ``numFeatures`` columns, and the list of labels.

    Raises:
        ValueError: if a feature field cannot be parsed as a float.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        # strip() removes the trailing newline and surrounding whitespace.
        arrayoLines = [line.strip() for line in fr]
    # Drop blank lines (e.g. a trailing empty line) instead of crashing.
    arrayoLines = [line for line in arrayoLines if line]
    # Size the result from the number of usable lines.
    returnMat = np.zeros((len(arrayoLines), numFeatures))
    classLabelVector = []
    for index, line in enumerate(arrayoLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value)
                               for value in listFromLine[0:numFeatures]]
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector
# Earlier experiments, kept for reference:
# DataMat,DataLabel=file2matrix('D:\learn\Ch02\datingTestSet.txt')
# group,label=createDataSet()
# result0=knn_classify0([3,3.5], group, label, 3)

# Visualization: scatter-plot feature column 1 against feature column 2.
# The path is a raw string so its backslashes are never read as escape
# sequences; the resulting value is the same as the unprefixed literal.
DataMat, DataLabel = file2matrix(r'D:\learn\Ch02\datingTestSet.txt')
fig = plt.figure()
# 211 = first slot of a grid with 2 rows and 1 column.
ax = fig.add_subplot(211)
ax.scatter(DataMat[:, 1], DataMat[:, 2])
# plt.show()
# Feature normalization
def autoNorm(dataset):
    """Rescale every column of ``dataset`` linearly onto the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to
    0.0 rather than dividing by zero.

    Args:
        dataset: 2-D array-like with one row per sample.

    Returns:
        A ``(normData, rangeValue, minValue)`` tuple: the normalized float
        array, the per-column range (max - min) and the per-column minimum.
    """
    # Force float so integer input is never floor-divided.
    dataset = np.asarray(dataset, dtype=float)
    minValue = dataset.min(0)
    maxValue = dataset.max(0)
    rangeValue = maxValue - minValue
    # Divide by 1 where the range is 0; the numerator there is already 0.
    divisor = np.where(rangeValue == 0, 1.0, rangeValue)
    # Broadcasting applies the per-column min and divisor to every row.
    normData = (dataset - minValue) / divisor
    return normData, rangeValue, minValue
def datingClassTest(filename=r'D:\learn\Ch02\datingTestSet.txt',
                    hoRatio=0.1, k=3):
    """Measure the classifier's hold-out error rate on a data file.

    The first ``hoRatio`` fraction of the normalized samples is used as the
    test set and the remaining samples as the training set. For every test
    sample the predicted and the actual label are printed, followed by the
    overall error rate.

    Args:
        filename: data file passed to ``file2matrix``.
        hoRatio: fraction of the samples held out for testing.
        k: number of neighbours passed to ``knn_classify0``.
    """
    DataMat, DataLabel = file2matrix(filename)
    normMat, ranges, minvalues = autoNorm(DataMat)
    m = normMat.shape[0]
    numOfTest = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numOfTest):
        # Train on everything after the held-out prefix.
        classiferResult = knn_classify0(normMat[i, :],
                                        normMat[numOfTest:m, :],
                                        DataLabel[numOfTest:m], k)
        # One pre-formatted argument prints the same on Python 2 and 3.
        print("%s %s" % (classiferResult, DataLabel[i]))
        if classiferResult != DataLabel[i]:
            errorCount = errorCount + 1
    # With too few samples the test set is empty; report 0 instead of
    # dividing by zero.
    errorRate = errorCount / numOfTest if numOfTest else 0.0
    print("the total error rate is : %f" % errorRate)
#datingClassTest()
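# Interactive tool: the user enters flier miles, time spent on video games and ice cream consumption, and the classifier predicts the matching class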
def classifyPerson(filename=r'D:\learn\Ch02\datingTestSet.txt'):
    """Prompt for three feature values and print the predicted class label.

    The answers are normalized with the minimum and range of the training
    data and classified against it using the 3 nearest neighbours.

    Args:
        filename: training data file passed to ``file2matrix``.

    Raises:
        ValueError: if an answer cannot be parsed as a float.
    """
    # raw_input exists only on Python 2; input is its Python 3 equivalent.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    ffmile = float(readLine("flier miles per year?"))
    percentGame = float(readLine("percentage of time spent playing video game?"))
    iceCream = float(readLine("ice cream consumed per year?"))
    DataMat, DataLabel = file2matrix(filename)
    normMat, ranges, minvalues = autoNorm(DataMat)
    inx = np.array([ffmile, percentGame, iceCream])
    # Avoid dividing by zero for a feature that is constant in the data.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    classiferResult = knn_classify0((inx - minvalues) / safeRanges,
                                    normMat, DataLabel, 3)
    print(classiferResult)
# Script entry point: start the interactive classifier.
# NOTE(review): this call also runs when the file is imported as a module;
# consider guarding it with `if __name__ == "__main__":`.
classifyPerson()