# -*- coding: UTF-8 -*-
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt


def classify(inX, dataSet, label, k):
    """Predict the class of ``inX`` with the k-nearest-neighbour rule.

    Steps:
      (1) compute the Euclidean distance from every row of ``dataSet``
          to ``inX``;
      (2) order the rows by increasing distance;
      (3) take the ``k`` closest rows;
      (4) count how often each label occurs among them;
      (5) return the most frequent label.

    Args:
        inX: query vector with one value per column of ``dataSet``.
        dataSet: 2-D numpy array, one labelled sample per row.
        label: sequence of labels, ``label[i]`` belonging to ``dataSet[i]``.
        k: number of neighbours that vote.

    Returns:
        The label with the most votes. On a tie the label that was met
        first (i.e. belongs to the closer neighbour) wins, because the
        sort below is stable.

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number
            of rows in ``dataSet``.
    """
    # Broadcasting subtracts the query from every row at once, so no
    # explicit tiling of ``inX`` is required.
    diffMat = dataSet - inX
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1: sum along each row
    distances = sqDistances ** 0.5

    # argsort() yields the row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for i in range(k):
        voteIlabel = label[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric fields followed
    by an integer class label in the last field.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVoctor)``: an ``(n, 3)`` float
        array built from the first three fields of each line, and a list
        with the ``n`` integer labels taken from the last field.

    Raises:
        ValueError: if a field cannot be converted to a number or a line
            has fewer than three feature fields.
    """
    # ``with`` guarantees the handle is closed even if parsing fails.
    with open(filename) as fr:
        rows = []
        for line in fr:
            line = line.strip()  # drop surrounding whitespace / newline
            if line:             # ignore blank lines (e.g. a trailing one)
                rows.append(line.split('\t'))

    returnMat = zeros((len(rows), 3))  # n rows, 3 feature columns
    classLabelVoctor = []
    for index, listFormLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFormLine[0:3]]
        classLabelVoctor.append(int(listFormLine[-1]))
    return returnMat, classLabelVoctor
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; it is mapped to
    0 instead of being divided by zero.

    Args:
        dataSet: 2-D numpy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array,
        the per-column ``max - min`` (still 0 for constant columns) and
        the per-column minimum.
    """
    minVals = dataSet.min(0)  # minimum of every column
    maxVals = dataSet.max(0)  # maximum of every column
    ranges = maxVals - minVals

    # Divide constant columns by 1 so they normalise to 0 rather than nan.
    safeRanges = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


def datingClassTest():
    """Print the hold-out error rate of the classifier on the dating data.

    Reads 'datingTestSet2.txt', normalises it, uses the first 10% of the
    rows as test samples and the remaining rows as training data with
    k = 3. Prints every prediction next to the true label, then the
    overall error rate.

    Raises:
        ZeroDivisionError: if the file has fewer than 10 rows, because no
            test sample is selected in that case.
    """
    hoRatio = 0.10
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    # Rows [0, numTestVecs) are tested; rows [numTestVecs, m) are the
    # training set.
    for i in range(numTestVecs):
        classifierResult = classify(normMat[i, :],
                                    normMat[numTestVecs:m, :],
                                    datingLabels[numTestVecs:m], 3)
        print("the guess :%d ,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is : %f" % (errorCount / float(numTestVecs)))
def classifyPerson():
    """Ask for one person's three feature values and print the prediction.

    Prompts on standard input for three numbers, normalises them with the
    minimum and range of the data in 'datingTestSet2.txt', classifies the
    result with k = 3 and prints the matching entry of ``resultList``.

    Raises:
        ValueError: if an answer cannot be converted to ``float``.
    """
    # Class labels are used as 1-based indices into this list.
    resultList = ['not at all', 'in small doses', 'in large doses']

    percentDats = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # NOTE(review): the feature order here is presumed to match the column
    # order of the data file - confirm against the file.
    inArr = array([ffMiles, percentDats, iceCream])

    # Scale the query exactly like the training data before classifying.
    classifierResult = classify((inArr - minVals) / ranges,
                                normMat, datingLabels, 3)
    print("you will probably like this person :",
          resultList[classifierResult - 1])
|