文章目录
K-Means-Clustering
-
数据集链接:https://pan.baidu.com/s/1aEgDpQoeT_kFbwD1YyrCaw
提取码:2021
代码:
#!/usr/bin/env python
# coding: utf-8
"""K-means clustering demo built on NumPy.

The module exposes three helpers -- ``initCentroids``, ``euclDistance`` and
``KMeans`` -- and, when executed as a script, loads a point set with
``np.loadtxt``, clusters it into four groups and plots the samples together
with the final centroids.
"""

import numpy as np


def initCentroids(data, k):
    """Draw ``k`` random centroids inside the bounding box of ``data``.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        k: Number of centroids to generate; must be at least 1.

    Returns:
        A float array of shape (k, n_features). Each column is sampled
        uniformly between that feature's minimum and maximum in ``data``.

    Raises:
        ValueError: If ``data`` is not a non-empty 2-D array or ``k < 1``.
    """
    data = np.asarray(data)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D array")
    if k < 1:
        raise ValueError("k must be a positive integer")

    n_features = data.shape[1]
    centroids = np.empty((k, n_features))
    # Columns are filled one at a time so the sequence of random draws is the
    # same as in the original implementation (reproducible under a seed).
    for col in range(n_features):
        column = data[:, col]
        # np.ptp() is used because the ndarray.ptp() method was removed in
        # NumPy 2.0.
        centroids[:, col] = column.min() + np.ptp(column) * np.random.rand(k)
    return centroids


def euclDistance(v1, v2):
    """Return the Euclidean distance between vectors ``v1`` and ``v2``.

    Inputs are converted to float first so that unsigned-integer inputs do
    not wrap around on subtraction and plain sequences are accepted.
    """
    diff = np.asarray(v2, dtype=float) - np.asarray(v1, dtype=float)
    return np.sqrt(np.sum(np.power(diff, 2)))


def _nearest_centroid(sample, centroids):
    """Return the index of the centroid closest to ``sample``."""
    best_index = 0
    # Start from infinity rather than a fixed large number, so arbitrarily
    # distant samples are still assigned to their true nearest centroid.
    best_distance = np.inf
    for index, centroid in enumerate(centroids):
        distance = euclDistance(sample, centroid)
        if distance < best_distance:
            best_distance = distance
            best_index = index
    return best_index


def KMeans(data, k):
    """Cluster ``data`` into ``k`` groups with Lloyd's k-means iteration.

    Centroids are initialised randomly with ``initCentroids``; samples are
    then repeatedly assigned to their nearest centroid and the centroids are
    recomputed as cluster means, until the centroids stop changing.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        k: Number of clusters; must be at least 1.

    Returns:
        A tuple ``(lists, newLists)`` where ``lists`` is a list of ``k``
        lists holding the sample rows assigned to each cluster, and
        ``newLists`` is the (k, n_features) array of final centroids.

    Raises:
        ValueError: If ``data`` is not a non-empty 2-D array or ``k < 1``.
    """
    data = np.asarray(data)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D array")
    if k < 1:
        raise ValueError("k must be a positive integer")

    centroids = initCentroids(data, k)
    while True:
        # Assignment step: put every sample into its nearest cluster.
        lists = [[] for _ in range(k)]
        for sample in data:
            lists[_nearest_centroid(sample, centroids)].append(sample)

        # Update step. A cluster that received no samples keeps its previous
        # centroid: the mean of an empty list is NaN, which would prevent the
        # equality test below from ever succeeding.
        newLists = np.array([
            np.mean(members, axis=0) if members else centroids[index]
            for index, members in enumerate(lists)
        ])

        if np.array_equal(newLists, centroids):
            break
        centroids = newLists

    # lists holds the partitioned clusters, newLists the final centroids.
    return lists, newLists


def main():
    """Load the sample data set, run k-means with k=4 and plot the result."""
    # Imported lazily so the clustering helpers above remain importable on
    # machines without matplotlib; only this demo needs it.
    import matplotlib.pyplot as plt

    # The original notebook reported this file as having shape (80, 2).
    data = np.loadtxt('数据集//testSet.txt')

    # Visualise the raw samples.
    plt.scatter(data[:, 0], data[:, 1])
    plt.show()

    _clusters, newLists = KMeans(data, k=4)

    # Draw the final centroids on top of the original scatter plot.
    plt.scatter(data[:, 0], data[:, 1])
    plt.scatter(newLists[:, 0], newLists[:, 1])
    plt.show()


if __name__ == "__main__":
    main()