1.K均值的步骤
a.根据业务经验,尽量选取准确合理的质心
b.然后对数据,即质点进行相应的聚合
c.当质点聚合完毕,或者达到指定的聚合标准阈值时,算法结束
d.若算法聚合完成后认为尚未完全分组,或未达到指定阈值,则重复进行a,b步骤.
2.K均值优缺点
优点:算法易于理解,实现,速度快,可并行化,如spark 对应实现
缺点:对质心进行硬性分析(非此即彼),无法进行后续的算法融合, 由于自身原因,容易产生误分的情况.
3.可视化
使用K均值对数据进行聚合分组后,用matplotlib对分组后的数据进行标识,对比查看分组的标识是否正确;以下也有实现
以下是实现(注意:下面的代码实现的是k近邻(KNN)分类算法,而不是上文描述的K均值聚类)
#-*-coding:utf-8-*-
# k-nearest-neighbour (kNN) classifier demo: a toy data set, the dating data
# set and 32x32 text-encoded handwritten digits.

# The star import is kept for backward compatibility; it comes first so that
# the explicit imports below win on any name clash.  Code in this file uses
# the explicit ``np.`` prefix and avoids builtins numpy may shadow.
from numpy import *  # noqa: F401,F403
import numpy as np
import operator
import os
# listdir is used to enumerate the digit sample files.
from os import listdir

try:
    # Legacy self-import.  The name is rebound by the KNN class below, so a
    # missing module (file saved under another name) is harmless.
    import KNN  # noqa: F401
except ImportError:
    pass

try:
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional; only showMat needs it.
    matplotlib = None
    plt = None

try:
    _read_input = raw_input  # Python 2
except NameError:
    _read_input = input  # Python 3

# Default data locations used when no explicit path is passed.
DATING_DATA_FILE = "/home/jerry/IdeaProjects/RSEvaluate/data/dataSet"
TRAINING_DIGITS_DIR = "/home/jerry/IdeaProjects/RSEvaluate/data/trainingDigits"
TEST_DIGITS_DIR = "/home/jerry/IdeaProjects/RSEvaluate/data/testDigits/"

# Text label -> numeric class; any other label maps to 1.
_DATING_LABEL_CODES = {'largeDoses': 3, 'smallDoses': 2}


def _digitFromFilename(filenameStr):
    """Return the digit encoded before the first "_" of a name like "0_13.txt"."""
    return int(filenameStr.split(".")[0].split("_")[0])


class KNN:
    """k-nearest-neighbour classification helpers and small demo drivers."""

    def createDataSet(self):
        """Return a four-point toy data set and its labels.

        Returns:
            (group, labels): a 4x2 float array and the list
            ['A', 'A', 'B', 'B'].
        """
        group = np.array([[1., 1.1],
                          [1., 1.],
                          [0., 0.],
                          [0., 0.1]])
        labels = ['A', 'A', 'B', 'B']
        return group, labels

    def classify(self, inx, dataSet, labels, k):
        """Classify ``inx`` by majority vote among its k nearest samples.

        Args:
            inx: feature vector to classify (length n, or shape (1, n)).
            dataSet: (m, n) array of training samples.
            labels: sequence of m labels, one per row of ``dataSet``.
            k: number of neighbours that vote; clamped to m.

        Returns:
            The label with the most votes among the k nearest samples.

        Raises:
            IndexError: if no neighbour can vote (empty data set or k < 1).
        """
        dataSet = np.asarray(dataSet, dtype=float)
        dataSetSize = dataSet.shape[0]
        # Euclidean distance from inx to every sample.  The original code
        # forgot to subtract dataSet, which made all distances equal.
        diffMat = np.asarray(inx, dtype=float) - dataSet
        distances = np.sqrt((diffMat ** 2).sum(axis=1))
        # A stable sort keeps equally distant samples in data-set order.
        sortedDistIndicies = distances.argsort(kind="mergesort")
        if k > dataSetSize:
            k = dataSetSize
        classCount = {}
        # Tally the labels of the k closest samples.
        for sampleIndex in sortedDistIndicies[:k]:
            voteIlabel = labels[sampleIndex]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        return sortedClassCount[0][0]

    def file2Matrix(self, filename):
        """Parse a tab-separated dating file into features and labels.

        Each non-blank line holds three numeric features followed by a text
        label.  'largeDoses' maps to 3, 'smallDoses' to 2 and anything else
        to 1.  Blank lines are skipped.

        Args:
            filename: path of the text file to read.

        Returns:
            (returnMat, classLabelVector): an (m, 3) float array and a list
            of m integer labels.
        """
        featureRows = []
        classLabelVector = []
        with open(filename) as fr:
            for line in fr:
                line = line.strip()
                if not line:
                    continue
                listFromLine = line.split("\t")
                featureRows.append([float(v) for v in listFromLine[0:3]])
                classLabelVector.append(
                    _DATING_LABEL_CODES.get(listFromLine[3], 1))
        # reshape keeps the (0, 3) shape when the file has no data rows.
        returnMat = np.array(featureRows, dtype=float).reshape(-1, 3)
        return returnMat, classLabelVector

    def showMat(self, datingDataMat, labelVector):
        """Scatter-plot the first two feature columns, styled by class.

        Args:
            datingDataMat: 2-D array with at least two feature columns.
            labelVector: numeric class label per row; scaled by 15 and used
                for both marker size and colour.

        Raises:
            ImportError: if matplotlib is not installed.
        """
        if plt is None:
            raise ImportError("matplotlib is required for showMat")
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # Title the axes that actually hold the scatter plot.
        ax.set_title("dating possiblity")
        markerStyle = 15.0 * np.array(labelVector)
        ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
                   markerStyle, markerStyle)
        plt.show()

    def autoNorm(self, dataSet):
        """Min-max normalise every column of ``dataSet``.

        Args:
            dataSet: 2-D numeric array, one sample per row.

        Returns:
            (normDataSet, ranges, minVals): the normalised array, the
            per-column max - min, and the per-column minimum.
        """
        minVals = dataSet.min(0)
        maxVals = dataSet.max(0)
        ranges = maxVals - minVals
        # Broadcasting applies the per-column offset and scale to each row.
        normDataSet = (dataSet - minVals) / ranges
        return normDataSet, ranges, minVals

    def datingClassTest(self, filename=DATING_DATA_FILE, hoRatio=0.10, k=3):
        """Hold-out evaluation of the classifier on the dating data.

        The first ``hoRatio`` fraction of rows is classified against the
        remaining rows.

        Args:
            filename: dating data file to load.
            hoRatio: fraction of rows held out for testing.
            k: number of neighbours that vote.

        Returns:
            The error rate on the held-out rows (0.0 if none are held out).
        """
        dateDataMat, dateLabels = self.file2Matrix(filename)
        normMat, ranges, minvals = self.autoNorm(dateDataMat)
        m = normMat.shape[0]
        numTestVecs = int(m * hoRatio)
        errorCount = 0.0
        for i in range(numTestVecs):
            classifierResult = self.classify(normMat[i, :],
                                             normMat[numTestVecs:m, :],
                                             dateLabels[numTestVecs:m], k)
            print("the classifer came back with %d,the real answer is: %d"
                  % (classifierResult, dateLabels[i]))
            if classifierResult != dateLabels[i]:
                errorCount += 1.0
        # Avoid dividing by zero when nothing was held out.
        errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
        print("the total error rate is : %f" % errorRate)
        return errorRate

    def classifyPerson(self, filename=DATING_DATA_FILE):
        """Prompt for three features and print the predicted liking level.

        Args:
            filename: dating data file used as the training set.
        """
        resultList = ['not at all', 'in small doses', 'in large doses']
        # Feature: share of time spent on video games.
        percentTats = float(_read_input("percentage of time spent playing video games?"))
        # Feature: frequent-flyer miles per year.
        ffMiles = float(_read_input("frequent filer miles earned per year?"))
        # Feature: litres of ice cream per year.
        iceCream = float(_read_input("liter of ice cream cosumed per year?"))
        datingDataMat, datingLabels = self.file2Matrix(filename)
        normMat, ranges, minvals = self.autoNorm(datingDataMat)
        inArr = np.array([ffMiles, percentTats, iceCream])
        # Scale the query the same way as the training data.
        classiferResult = self.classify((inArr - minvals) / ranges,
                                        normMat, datingLabels, 3)
        print("%s %s" % ("You will probably like this person: ",
                         resultList[classiferResult - 1]))

    def img2Vector(self, filename):
        """Read a 32x32 text image of digit characters into a 1x1024 vector.

        Args:
            filename: path of a file whose first 32 lines each start with
                32 digit characters.

        Returns:
            A (1, 1024) float array in row-major order.
        """
        returnVector = np.zeros((1, 1024))
        with open(filename) as fr:
            for i in range(32):
                lineStr = fr.readline()
                for j in range(32):
                    returnVector[0, 32 * i + j] = int(lineStr[j])
        return returnVector

    def handwritingClassTest(self, trainingDir=TRAINING_DIGITS_DIR,
                             testDir=TEST_DIGITS_DIR, k=3):
        """Evaluate the classifier on the handwritten-digit samples.

        Sample files are named "<digit>_<index>.txt"; the digit before the
        underscore is the class label for training and test files alike.

        Args:
            trainingDir: directory holding the training samples.
            testDir: directory holding the test samples.
            k: number of neighbours that vote.

        Returns:
            The error rate on the test samples (0.0 if there are none).
        """
        hwLabels = []
        trainingFileList = listdir(trainingDir)
        m = len(trainingFileList)
        # One 1024-element row per training image.
        trainingMat = np.zeros((m, 1024))
        for i, filenameStr in enumerate(trainingFileList):
            hwLabels.append(_digitFromFilename(filenameStr))
            trainingMat[i, :] = self.img2Vector(
                os.path.join(trainingDir, filenameStr))
        testFileList = listdir(testDir)
        errorCount = 0.0
        mTest = len(testFileList)
        for filenameStr in testFileList:
            # The label is the part before "_", exactly as for training.
            classNumstr = _digitFromFilename(filenameStr)
            vectorUnderTest = self.img2Vector(
                os.path.join(testDir, filenameStr))
            classifierResult = self.classify(vectorUnderTest, trainingMat,
                                             hwLabels, k)
            if classifierResult != classNumstr:
                errorCount += 1.0
        # Avoid dividing by zero when the test directory is empty.
        errorRate = errorCount / float(mTest) if mTest else 0.0
        print("\n the total number of error is: %d" % errorCount)
        print("\nthe total error rate is: %f" % errorRate)
        return errorRate


if __name__ == '__main__':
    ins = KNN()
    # Toy example.
    group, labels = ins.createDataSet()
    res = ins.classify([0, 0], group, labels, 3)
    # Dating data: load, then scale every feature to [0, 1] so that all
    # features weigh equally in the distance.
    retmat, vector = ins.file2Matrix(DATING_DATA_FILE)
    normMat, ranges, minvals = ins.autoNorm(retmat)
    # Optional demos:
    # ins.showMat(retmat, vector)
    # ins.datingClassTest()
    # ins.classifyPerson()
    # Handwritten digit recognition.
    ins.handwritingClassTest()