K-mean聚类的一个代码的详细注释 (a K-means clustering implementation with detailed line-by-line commentary)

#  coding:utf-8
from numpy import *
import time
import matplotlib.pyplot as plt

# calculate Euclidean distance between two equally-shaped vectors
def euclDistance(vector1, vector2):
    # d = sqrt(sum_i (v1_i - v2_i)^2): square the component-wise
    # differences, add them up, then take the square root.
    diff = vector1 - vector2
    return sqrt((diff ** 2).sum())

# init centroids with k *distinct* random samples from the data set
def initCentroids(dataSet, k):
    """Pick k distinct rows of dataSet as the initial centroids.

    Returns a (k, dim) ndarray.  The original drew each index with
    int(random.uniform(0, numSamples)), i.e. with replacement, so the
    same sample could be chosen twice — duplicate centroids later
    produce empty clusters.  Drawing from a permutation fixes that.
    """
    numSamples, dim = dataSet.shape
    centroids = zeros((k, dim))
    # k distinct row indices (assumes k <= numSamples)
    indices = random.permutation(numSamples)[:k]
    for i, index in enumerate(indices):
        centroids[i, :] = dataSet[index, :]
    return centroids

# k-means cluster
def kmeans(dataSet, k):
    """Cluster the rows of dataSet (a numpy matrix) into k groups.

    Returns (centroids, clusterAssment):
      - centroids: (k, dim) array of cluster centers
      - clusterAssment: (numSamples, 2) matrix; column 0 stores which
        cluster each sample belongs to, column 1 the squared distance
        between the sample and its centroid.

    Fixes over the original: the parameter was named ``l`` while the
    body used ``k`` (it only ran because a global ``k`` existed);
    ``xrange``/``print`` were Python-2-only; an empty cluster made
    ``mean()`` return NaN.
    """
    numSamples = dataSet.shape[0]
    clusterAssment = mat(zeros((numSamples, 2)))
    clusterChanged = True

    # step 1: init centroids
    centroids = initCentroids(dataSet, k)

    while clusterChanged:
        clusterChanged = False
        ## for each sample
        for i in range(numSamples):
            minDist = inf  # safe sentinel; the old 100000.0 could be exceeded
            minIndex = 0
            ## step 2: find the closest centroid
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j

            ## step 3: update this sample's cluster assignment
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
                clusterAssment[i, :] = minIndex, minDist ** 2

        ## step 4: recompute each centroid as the mean of its members
        for j in range(k):
            pointsInCluster = dataSet[nonzero(clusterAssment[:, 0].A == j)[0]]
            # guard: mean() of an empty cluster would be NaN — keep the
            # previous centroid instead
            if len(pointsInCluster) > 0:
                centroids[j, :] = mean(pointsInCluster, axis=0)

    print('Congratulations, cluster complete!')
    return centroids, clusterAssment

# show your cluster; only available for 2-D data
def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot the samples colored by cluster, plus the centroids.

    Returns 1 (without plotting) when the data is not 2-D or k exceeds
    the number of available marker styles.
    """
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry your k is too large! Please contact Zouxy")
        return 1

    # draw all samples, styled by the cluster stored in column 0
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)
    # show() once, after everything is plotted — the original called it
    # inside the loop, popping a blocking window for every centroid
    plt.show()
    #plt.savefig('foo1.png')

#from numpy import *
#import time
#import matplotlib.pyplot as plt

## step 1: load data
print("step 1: load data...")
dataSet = []
# NOTE(review): hard-coded input path; each line is expected to hold two
# tab-separated floats (x, y) — confirm against the actual data file.
# `with` closes the file even on error (the original leaked the handle).
with open('/home/amos/machine_learning') as fileIn:
    for line in fileIn:
        lineArr = line.strip().split('\t')
        dataSet.append([float(lineArr[0]), float(lineArr[1])])

## step 2: clustering...
print("step 2: clustering...")
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = kmeans(dataSet, k)

## step 3: show the result
print("step 3: show the result...")
showCluster(dataSet, k, centroids, clusterAssment)
# "step 2" can also be done by calling sklearn.cluster.KMeans directly
print("step 2: clustering...")
dataSet = mat(dataSet)
k = 4

############################################################
# by importing KMeans from sklearn.cluster
# call KMeans and get the same result as kmeans.py
# (the original never imported KMeans anywhere — NameError)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=k, random_state=0).fit(dataSet)
centroids = kmeans.cluster_centers_
clusterAssment = kmeans.labels_
############################################################

# For "step 3", in showCluster(dataSet, k, centroids, clusterAssment)
# the "draw all samples" part must change
#   markIndex = int(clusterAssment[i, 0])
# to
#   markIndex = int(clusterAssment[i])
# because sklearn's labels_ is a 1-D array, not an (n, 2) matrix.
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
本程序是在python中完成,基于sklearn.cluster中的k-means聚类包来实现数据的聚类,对于里面使用的数据格式如下:(注意更改程序中的相关参数) 138 0 124 1 127 2 129 3 119 4 127 5 124 6 120 7 123 8 147 9 188 10 212 11 229 12 240 13 240 14 241 15 240 16 242 17 174 18 130 19 132 20 119 21 48 22 37 23 49 0 42 1 34 2 26 3 20 4 21 5 23 6 13 7 19 8 18 9 36 10 25 11 20 12 19 13 19 14 5 15 29 16 22 17 13 18 46 19 15 20 8 21 33 22 41 23 69 0 56 1 49 2 40 3 52 4 62 5 54 6 32 7 38 8 44 9 55 10 70 11 74 12 105 13 107 14 56 15 55 16 65 17 100 18 195 19 136 20 87 21 64 22 77 23 61 0 53 1 47 2 33 3 34 4 28 5 41 6 40 7 38 8 33 9 26 10 31 11 31 12 13 13 17 14 17 15 25 16 17 17 17 18 14 19 16 20 17 21 29 22 44 23 37 0 32 1 34 2 26 3 23 4 25 5 25 6 27 7 30 8 25 9 17 10 12 11 12 12 12 13 7 14 6 15 6 16 12 17 12 18 39 19 34 20 32 21 34 22 35 23 33 0 57 1 81 2 77 3 68 4 61 5 60 6 56 7 67 8 102 9 89 10 62 11 57 12 57 13 64 14 62 15 69 16 81 17 77 18 64 19 62 20 79 21 75 22 57 23 73 0 88 1 75 2 70 3 77 4 73 5 72 6 76 7 76 8 74 9 98 10 90 11 90 12 85 13 79 14 79 15 88 16 88 17 81 18 84 19 89 20 79 21 68 22 55 23 63 0 62 1 58 2 58 3 56 4 60 5 56 6 56 7 58 8 56 9 65 10 61 11 60 12 60 13 61 14 65 15 55 16 56 17 61 18 64 19 69 20 83 21 87 22 84 23 41 0 35 1 38 2 45 3 44 4 49 5 55 6 47 7 47 8 29 9 14 10 12 11 4 12 10 13 9 14 7 15 7 16 11 17 12 18 14 19 22 20 29 21 23 22 33 23 34 0 38 1 38 2 37 3 37 4 34 5 24 6 47 7 70 8 41 9 6 10 23 11 4 12 15 13 3 14 28 15 17 16 31 17 39 18 42 19 54 20 47 21 68 22
c++实现k-mean算法#include <stdio.h> #include <stdlib.h> #include <string.h> #include <conio.h> #include <math.h> // FUNCTION PROTOTYPES // DEFINES #define SUCCESS 1 #define FAILURE 0 #define TRUE 1 #define FALSE 0 #define MAXVECTDIM 20 #define MAXPATTERN 20 #define MAXCLUSTER 10 char *f2a(double x, int width){ char cbuf[255]; char *cp; int i,k; int d,s; cp=fcvt(x,width,&d,&s); if (s) { strcpy(cbuf,"-"); } else { strcpy(cbuf," "); } /* endif */ if (d>0) { for (i=0; i<d; i++) { cbuf[i+1]=cp[i]; } /* endfor */ cbuf[d+1]=0; cp+=d; strcat(cbuf,"."); strcat(cbuf,cp); } else { if (d==0) { strcat(cbuf,"."); strcat(cbuf,cp); } else { k=-d; strcat(cbuf,"."); for (i=0; i<k; i++) { strcat(cbuf,"0"); } /* endfor */ strcat(cbuf,cp); } /* endif */ } /* endif */ cp=&cbuf[0]; return cp; } // ***** Defined structures & classes ***** struct aCluster { double Center[MAXVECTDIM]; int Member[MAXPATTERN]; //Index of Vectors belonging to this cluster int NumMembers; }; struct aVector { double Center[MAXVECTDIM]; int Size; }; class System { private: double Pattern[MAXPATTERN][MAXVECTDIM+1]; aCluster Cluster[MAXCLUSTER]; int NumPatterns; // Number of patterns int SizeVector; // Number of dimensions in vector int NumClusters; // Number of clusters void DistributeSamples(); // Step 2 of K-means algorithm int CalcNewClustCenters();// Step 3 of K-means algorithm double EucNorm(int, int); // Calc Euclidean norm vector int FindClosestCluster(int); //ret indx of clust closest to pattern //whose index is arg public: system(); int LoadPatterns(char *fname); // Get pattern data to be clustered void InitClusters(); // Step 1 of K-means algorithm void RunKMeans(); // Overall control K-means process void ShowClusters(); // Show results on screen void SaveClusters(char *fname); // Save results to file void ShowCenters(); }; void System::ShowCenters(){ int i; printf("Cluster centers:\n"); for (i=0; i<NumClusters; i++) { Cluster[i].Member[0]=i; 
printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]); } /* endfor */ printf("\n"); } int System::LoadPatterns(char *fname){ FILE *InFilePtr; int i,j; double x; if((InFilePtr = fopen(fname, "r")) == NULL) return FAILURE; fscanf(InFilePtr, "%d", &NumPatterns); // Read # of patterns fscanf(InFilePtr, "%d", &SizeVector); // Read dimension of vector fscanf(InFilePtr, "%d", &NumClusters); // Read # of clusters for K-Means for (i=0; i<NumPatterns; i++) { // For each vector for (j=0; j<SizeVector; j++) { // create a pattern fscanf(InFilePtr,"%lg",&x); // consisting of all elements Pattern[i][j]=x; } /* endfor */ } /* endfor */ printf("Input patterns:\n"); for (i=0; i<NumPatterns; i++) { printf("Pattern[%d]=(%2.3f,%2.3f)\n",i,Pattern[i][0],Pattern[i][1]); } /* endfor */ printf("\n--------------------\n"); return SUCCESS; } //*************************************************************************** // InitClusters * // Arbitrarily assign a vector to each of the K clusters * // We choose the first K vectors to do this * //*************************************************************************** void System::InitClusters(){ int i,j; printf("Initial cluster centers:\n"); for (i=0; i<NumClusters; i++) { Cluster[i].Member[0]=i; for (j=0; j<SizeVector; j++) { Cluster[i].Center[j]=Pattern[i][j]; } /* endfor */ } /* endfor */ for (i=0; i<NumClusters; i++) { printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]); } /* endfor */ printf("\n"); } void System::RunKMeans(){ int converged; int pass; pass=1; converged=FALSE; while (converged==FALSE) { printf("PASS=%d\n",pass++); DistributeSamples(); converged=CalcNewClustCenters(); ShowCenters(); } /* endwhile */ } double System::EucNorm(int p, int c){ // Calc Euclidean norm of vector difference double dist,x; // between pattern vector, p, and cluster int i; // center, c. 
char zout[128]; char znum[40]; char *pnum; pnum=&znum[0]; strcpy(zout,"d=sqrt("); printf("The distance from pattern %d to cluster %d is calculated as:\n",c,p); dist=0; for (i=0; i<SizeVector ;i++){ x=(Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]); strcat(zout,f2a(x,4)); if (i==0) strcat(zout,"+"); dist += (Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]); } /* endfor */ printf("%s)\n",zout); return dist; } int System::FindClosestCluster(int pat){ int i, ClustID; double MinDist, d; MinDist =9.9e+99; ClustID=-1; for (i=0; i<NumClusters; i++) { d=EucNorm(pat,i); printf("Distance from pattern %d to cluster %d is %f\n\n",pat,i,sqrt(d)); if (d<MinDist) { MinDist=d; ClustID=i; } /* endif */ } /* endfor */ if (ClustID<0) { printf("Aaargh"); exit(0); } /* endif */ return ClustID; } void System::DistributeSamples(){ int i,pat,Clustid,MemberIndex; //Clear membership list for all current clusters for (i=0; i<NumClusters;i++){ Cluster[i].NumMembers=0; } for (pat=0; pat<NumPatterns; pat++) { //Find cluster center to which the pattern is closest Clustid= FindClosestCluster(pat); printf("patern %d assigned to cluster %d\n\n",pat,Clustid); //post this pattern to the cluster MemberIndex=Cluster[Clustid].NumMembers; Cluster[Clustid].Member[MemberIndex]=pat; Cluster[Clustid].NumMembers++; } /* endfor */ } int System::CalcNewClustCenters(){ int ConvFlag,VectID,i,j,k; double tmp[MAXVECTDIM]; char xs[255]; char ys[255]; char nc1[20]; char nc2[20]; char *pnc1; char *pnc2; pnc1=&nc1[0]; pnc2=&nc2[0]; ConvFlag=TRUE; printf("The new cluster centers are now calculated as:\n"); for (i=0; i<NumClusters; i++) { //for each cluster pnc1=itoa(Cluster[i].NumMembers,nc1,10); pnc2=itoa(i,nc2,10); strcpy(xs,"Cluster Center"); strcat(xs,nc2); strcat(xs,"(1/"); strcpy(ys,"(1/"); strcat(xs,nc1); strcat(ys,nc1); strcat(xs,")("); strcat(ys,")("); for (j=0; j<SizeVector; j++) { // clear workspace tmp[j]=0.0; } /* endfor */ for (j=0; j<Cluster[i].NumMembers; 
j++) { //traverse member vectors VectID=Cluster[i].Member[j]; for (k=0; k<SizeVector; k++) { //traverse elements of vector tmp[k] += Pattern[VectID][k]; // add (member) pattern elmnt into temp if (k==0) { strcat(xs,f2a(Pattern[VectID][k],3)); } else { strcat(ys,f2a(Pattern[VectID][k],3)); } /* endif */ } /* endfor */ if(j<Cluster[i].NumMembers-1){ strcat(xs,"+"); strcat(ys,"+"); } else { strcat(xs,")"); strcat(ys,")"); } } /* endfor */ for (k=0; k<SizeVector; k++) { //traverse elements of vector tmp[k]=tmp[k]/Cluster[i].NumMembers; if (tmp[k] != Cluster[i].Center[k]) ConvFlag=FALSE; Cluster[i].Center[k]=tmp[k]; } /* endfor */ printf("%s,\n",xs); printf("%s\n",ys); } /* endfor */ return ConvFlag; } void System::ShowClusters(){ int cl; for (cl=0; cl<NumClusters; cl++) { printf("\nCLUSTER %d ==>[%f,%f]\n", cl,Cluster[cl].Center[0],Cluster[cl].Center[1]); } /* endfor */ } void System::SaveClusters(char *fname){ } void main(int argc, char *argv[]) { System kmeans; if (argc<2) { printf("USAGE: KMEANS PATTERN_FILE\n"); exit(0); } if (kmeans.LoadPatterns(argv[1])==FAILURE ){ printf("UNABLE TO READ PATTERN_FILE:%s\n",argv[1]); exit(0); } kmeans.InitClusters(); kmeans.RunKMeans(); kmeans.ShowClusters(); } 
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值