虽然可以把文本(text)数据全部量化成数值,但仍然需要把文本转换成 NumPy 数组的形式(这一点必须掌握)。
在将数据输入到分类器之前,必须将待处理数据的格式改变为分类器可以接受的格式。
主要步骤:数据规范化(转换格式)、数据归一化、将数据代入分类算法、输出误差分析。
代码:
# -*- coding:utf-8 -*-
from numpy import *


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must contain at least three numeric feature columns,
    with an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``. ``returnMat`` is a float
        array of shape ``(number_of_rows, 3)`` holding the first three columns
        of each row; ``classLabelVector`` is a list with the integer label of
        each row, in file order.

    Raises:
        ValueError: If a feature or label cannot be converted to a number, or
            a row has fewer than three feature columns.
    """
    rows = []
    # Read the file once, inside a context manager, so the handle is always
    # closed (the file used to be opened twice and never closed).
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if line:  # skip blank lines instead of crashing on them
                rows.append(line.split('\t'))

    returnMat = zeros((len(rows), 3))  # matrix to return
    classLabelVector = []  # labels to return
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


if __name__ == "__main__":
    # Everything in the file is numeric; the "how much do I like this person"
    # answer is encoded as the rank 1, 2 or 3.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')

    import matplotlib
    import matplotlib.pyplot as plt

    # matplotlib is a widely used Python plotting library with a MATLAB-like
    # command API; it suits interactive plotting and can also be embedded in
    # GUI applications as a plotting widget.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Marker size and colour are both scaled by the class label.
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               15.0 * array(datingLabels), 15.0 * array(datingLabels))
    plt.show()
import operator


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: NumPy array with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column ``max - min`` and the per-column minimum.

    Note:
        A column whose values are all identical has a range of 0, so the
        division yields NaN for that column.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tile() copies are needed.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


if __name__ == "__main__":
    normMat, ranges, minVals = autoNorm(datingDataMat)


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, measured between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify.
        dataSet: NumPy array of training samples, one per row.
        labels: Sequence holding the label of each row of ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` exceeds the number of rows in ``dataSet``, or if
            ``k`` is 0 (no votes are cast).
    """
    diffMat = asarray(inX) - dataSet  # broadcast inX against every row
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() works on Python 2 and 3; iteritems() exists only on Python 2.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def datingClassTest(hoRatio=0.10, k=4, filename='datingTestSet2.txt'):
    """Print the hold-out error rate of ``classify0`` on the dating data set.

    The first ``int(m * hoRatio)`` normalized rows are used as test samples
    and the remaining rows as training data. One line is printed per test
    sample, followed by the overall error rate.

    Args:
        hoRatio: Fraction of the rows held out for testing.
        k: Number of neighbours passed to ``classify0``.
        filename: Data file loaded with ``file2matrix``.

    Raises:
        ZeroDivisionError: If the hold-out count rounds down to 0.
    """
    ErrorCount = 0.0
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    count = int(m * hoRatio)  # range() needs an integer
    for i in range(count):
        # normMat[i, :] is the i-th normalized row, used as the test sample.
        # normMat[count:m, :] is the training data (m - count samples).
        # datingLabels[count:m] are the labels of those training samples.
        classifierResult = classify0(normMat[i, :], normMat[count:m, :],
                                     datingLabels[count:m], k)
        # A single pre-formatted argument prints identically on Python 2 and 3.
        print("the classifier came back with:%d,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            ErrorCount += 1.0
    print("the total error rate is :%f" % (ErrorCount / float(count)))


def classifyPerson():
    """Ask for three feature values on stdin and print the predicted class.

    The answers are normalized with the minimum and range of the training
    data before being classified with ``classify0`` (k = 3).
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    try:
        readLine = raw_input  # Python 2
    except NameError:
        readLine = input  # Python 3
    # float() converts the typed text into a number
    percentTats = float(readLine(
        "percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Pack the answers into an array in the column order used for inArr
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Labels are 1-based, resultList is 0-based
    print("You will probably like this person:" + " "
          + resultList[classifierResult - 1])