python中kmeans怎么导入数据集_通过Python实践K-means算法

最新推荐文章于 2023-02-16 20:26:52 发布

weixin_39592240

最新推荐文章于 2023-02-16 20:26:52 发布

阅读量584

点赞数

文章标签： python中kmeans怎么导入数据集

1 importnumpy as np2 importrandom3 from matplotlib importpyplot as plt4

class K_means(object):
    """K-means clustering over the rows of a data matrix."""

    def __init__(self, X, k, maxIter):
        """Store the clustering inputs.

        Args:
            X: Data set as a 2-D matrix, one point per row.
            k: Number of clusters to form.
            maxIter: Maximum number of loop iterations the algorithm may run.
        """
        # np.asarray hands an ndarray back unchanged, and lets a plain nested
        # list work with the `.shape` lookups the other methods perform.
        self.X = np.asarray(X)
        self.k = k
        self.maxIter = maxIter

def K_means(self):
    """Run the clustering loop and return the labelled data set.

    Belongs to the ``K_means`` class (takes ``self``). Builds a copy of
    ``self.X`` with one extra trailing column holding each row's cluster
    label (1..k, initially 0), then alternates between labelling rows and
    recomputing centres until ``self.stop`` reports completion.

    Side effects: prints progress to stdout and writes the final matrix to
    ``kmeans.npy`` in the current working directory.

    Returns:
        The ``(rows, cols + 1)`` array whose last column is the cluster label.
    """
    row, col = self.X.shape

    # Same rows as X plus one trailing column for the cluster label.
    dataset = np.zeros((row, col + 1))
    dataset[:, :-1] = self.X
    print("begin:dataset:\n" + repr(dataset))

    # Seed the centres with k randomly chosen rows. Use self.k (the original
    # referenced a bare `k`) and draw without replacement whenever possible
    # so two clusters never start on the same point.
    seedRows = np.random.choice(row, size=self.k, replace=self.k > row)
    centerpoints = dataset[seedRows]  # fancy indexing -> independent copy
    centerpoints[:, -1] = np.arange(1, self.k + 1)

    oldCenterpoints = None  # centres from the previous iteration
    iterations = 1  # current iteration number

    while not self.stop(oldCenterpoints, centerpoints, iterations):
        print("corrent iteration:" + str(iterations))
        print("centerpoint:\n" + repr(centerpoints))
        print("dataset:\n" + repr(dataset))

        # Remember this round's centres so the next stop() check can tell
        # whether they moved.
        oldCenterpoints = np.copy(centerpoints)
        iterations += 1

        # Assign every row to its nearest centre, then recompute the centres
        # from the new assignment.
        self.updateLabel(dataset, centerpoints)
        centerpoints = self.getCenterpoint(dataset)

    np.save("kmeans.npy", dataset)
    return dataset

def stop(self, oldCenterpoints, centerpoints, iterations):
    """Decide whether the clustering loop should end.

    Belongs to the ``K_means`` class (reads ``self.maxIter``).

    Args:
        oldCenterpoints: Centres from the previous iteration, or ``None``
            before the first iteration has run.
        centerpoints: Centres from the current iteration.
        iterations: Current iteration number.

    Returns:
        ``True`` when ``iterations`` exceeds ``self.maxIter`` or the centres
        did not change since the previous iteration; ``False`` otherwise.
    """
    if iterations > self.maxIter:
        return True
    # Nothing to compare against yet: keep going. Stated explicitly instead of
    # relying on array_equal(None, array) happening to be False.
    if oldCenterpoints is None:
        return False
    return bool(np.array_equal(oldCenterpoints, centerpoints))

def updateLabel(self, dataset, centerpoints):
    """Write each row's nearest-centre label into the last column, in place.

    Belongs to the ``K_means`` class (calls ``self.getLabel``).

    Args:
        dataset: 2-D array; every column but the last holds coordinates, the
            last column receives the cluster label.
        centerpoints: 2-D array of centres laid out the same way, with the
            centre's label in the last column.
    """
    # Iterating a 2-D ndarray yields row views, so assigning to point[-1]
    # updates `dataset` itself.
    for point in dataset:
        point[-1] = self.getLabel(point[:-1], centerpoints)

def getLabel(self, datasetRow, centerpoints):
    """Return the label of the centre closest to ``datasetRow``.

    Belongs to the ``K_means`` class; it does not read instance state. A point
    is assigned to whichever centre lies at the shortest straight-line
    (Euclidean) distance. On a tie the earlier centre wins because only a
    strictly smaller distance replaces the current best.

    Args:
        datasetRow: 1-D array of the point's coordinates (no label column).
        centerpoints: 2-D array of centres; the last column is the label.

    Returns:
        The label stored in the last column of the nearest centre.
    """
    # Start from the first centre, then look for anything strictly closer.
    label = centerpoints[0, -1]
    minDist = np.linalg.norm(datasetRow - centerpoints[0, :-1])
    for center in centerpoints[1:]:
        dist = np.linalg.norm(datasetRow - center[:-1])
        if dist < minDist:
            minDist = dist
            label = center[-1]
    print("minDist:" + str(minDist) + ",belong to label:" + str(label))
    return label

def getCenterpoint(self, dataset):
    """Recompute the k cluster centres from the current labelling.

    Belongs to the ``K_means`` class (reads ``self.k``).

    Args:
        dataset: 2-D array whose last column holds each row's cluster label
            (1..k) and whose other columns hold the coordinates.

    Returns:
        A ``(k, dataset.shape[1])`` array. Row ``i - 1`` holds the mean of
        all points labelled ``i`` followed by the label ``i`` itself.
    """
    newCenterpoint = np.zeros((self.k, dataset.shape[1]))
    for i in range(1, self.k + 1):
        # All points assigned to cluster i on the previous pass, without the
        # label column.
        oneCluster = dataset[dataset[:, -1] == i, :-1]
        if len(oneCluster):
            # axis=0 averages down the rows, giving one mean per coordinate.
            newCenterpoint[i - 1, :-1] = oneCluster.mean(axis=0)
        elif dataset.shape[0]:
            # An empty cluster would make np.mean return NaN, which then
            # poisons every later distance and equality check. Re-seed the
            # centre from a random data point instead.
            pick = np.random.randint(dataset.shape[0])
            newCenterpoint[i - 1, :-1] = dataset[pick, :-1]
        newCenterpoint[i - 1, -1] = i  # label the centre with its cluster id
    return newCenterpoint

72 #将散点图画出来

73 defdrawScatter(self):74 plt.xlabel("X")75 plt.ylabel("Y")76 dataset =self.K_means()77 x = dataset[:, 0] #第一列的数值为横坐标

78 y = dataset[:, 1] #第二列的数值为纵坐标

79 c = dataset[:, -1] #最后一列的数值用来区分颜色

80 color = ["none", "b", "r", "g", "y","m","c","k"]81 c_color =[]82

83 for i inc:84 c_color.append(color[int(i)])#给每一种类别的点都涂上不同颜色，便于观察