1,收集数据:提供文本文件
2,准备数据:使用Python解析文本文件
3,分析数据:使用Matplotlib画二维散点图
4,训练算法:此步骤不适用于该算法
5,测试算法
6,使用算法:产生简单的命令行程序,然后输入一些特征数据以判断对方是否为自己喜欢的类型
from numpy import *
import operator  # supplies itemgetter, used to rank the neighbour votes


def _promptLine(message):
    """Read one line from the user on both Python 2 and Python 3."""
    try:
        reader = raw_input  # Python 2 name
    except NameError:
        reader = input  # Python 3 renamed raw_input to input
    return reader(message)


def _nonZeroRanges(ranges):
    """Return ranges with every zero replaced by 1 so it is safe to divide by."""
    return where(ranges == 0, 1, ranges)


def createDataSet():
    """Return a tiny hand-made training set for quick experiments.

    Returns:
        (group, labels): a 4x2 array of points and the list of their class
        labels ('A' or 'B'), in the same order.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest neighbours.

    Args:
        inX: feature vector to classify.
        dataSet: 2-D array with one training sample per row.
        labels: class label of each row of dataSet.
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the k nearest rows.

    Raises:
        IndexError: if k is less than 1 or larger than the number of rows.
    """
    # Broadcasting subtracts inX from every row; no need to tile() a copy.
    diffMat = asarray(inX) - asarray(dataSet)
    # Euclidean distance from inX to every training row.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from nearest to farthest.
    sortedDistIndices = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() works on Python 2 and 3 (iteritems() exists only on Python 2).
    # Sort the (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def filesmatrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line must hold three numeric feature columns followed by
    an integer class label in the last column. Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array and a list of n
        integer labels, where n is the number of non-blank lines.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    records = [line for line in stripped if line]
    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, line in enumerate(records):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of dataSet linearly to the interval [0, 1].

    A column whose values are all equal has range 0; it is mapped to 0
    instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        (normDataSet, ranges, minVals): the rescaled data, the per-column
        max - min, and the per-column minimum.
    """
    minVals = dataSet.min(0)  # column-wise minimum
    maxVals = dataSet.max(0)  # column-wise maximum
    ranges = maxVals - minVals
    normDataSet = (dataSet - minVals) / _nonZeroRanges(ranges)
    return normDataSet, ranges, minVals


def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Print the hold-out error rate of the classifier on the dating data.

    The first hoRatio fraction of the rows is used as test vectors and the
    remaining rows as the training set.

    Args:
        hoRatio: fraction of the rows held out for testing.
        filename: data file in the format read by filesmatrix.
        k: number of neighbours passed to classify0.
    """
    datingDataMat, datingLabels = filesmatrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With no test vectors the rate is reported as 0 rather than dividing by 0.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is :%f" % errorRate)
    print(numTestVecs)
    print(datingLabels)


def classifyPerson(filename='datingTestSet2.txt'):
    """Ask for three features on the command line and print the prediction.

    Args:
        filename: data file in the format read by filesmatrix.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(_promptLine(
        "percenrage of time spent playing video games?"))
    ffMiles = float(_promptLine("frequent filer miles earnde per year?"))
    iceCreams = float(_promptLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = filesmatrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCreams])
    # Scale the input exactly like the training data (ranges = max - min).
    classifierResult = classify0((inArr - minVals) / _nonZeroRanges(ranges),
                                 normMat, datingLabels, 3)
    print("you will probably like this person: %s"
          % resultList[classifierResult - 1])