任务1 完整K-Means算法实现及聚类结果
补全完整的K-Means算法代码
在数据集data.csv上评价 K-Means 算法的聚类结果
绘出质心与所属样本
预期实验结果
- kMeans 聚类代码实现
import numpy as np
def distEuclidean(vecA, vecB):
    """Return the Euclidean (L2) distance between two vectors."""
    diff = vecA - vecB
    return np.sqrt((diff * diff).sum())
def randCent(dataArr, k):
    """Generate k random initial centroids within the data's per-dimension range.

    Parameters
    ----------
    dataArr : pandas.DataFrame
        Samples, one row per point (accessed via ``.values``).
    k : int
        Number of centroids to generate.

    Returns
    -------
    numpy.matrix of shape (k, n)
        One random centroid per row, each coordinate drawn uniformly from
        [min, max] of the corresponding data column.
    """
    n = np.shape(dataArr)[1]
    centroids = np.mat(np.zeros((k, n)))  # bug fix: `zeros` -> `np.zeros` (NameError)
    # Bug fix: seed once, NOT inside the loop — re-seeding per column made every
    # dimension draw identical fractions, collapsing centroids onto a line.
    np.random.seed(12345)
    for i in range(n):  # for each dimension, draw k values within the data range
        minJ = np.min(dataArr.values[:, i])
        maxJ = np.max(dataArr.values[:, i])
        rangeJ = float(maxJ - minJ)
        # bug fix: `random.rand` -> `np.random.rand` (the `random` module has no `rand`)
        centroids[:, i] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataArr, k):
    """Cluster the rows of ``dataArr`` into k clusters (Lloyd's algorithm).

    Parameters
    ----------
    dataArr : pandas.DataFrame
        Samples, one row per point.
    k : int
        Number of clusters.

    Returns
    -------
    centroids : numpy.matrix, shape (k, n)
        Final cluster centers.
    clusterAssment : numpy.matrix, shape (m, 2)
        Column 0: index of the assigned cluster; column 1: squared distance
        from the point to its centroid.
    """
    m = dataArr.shape[0]  # number of samples
    clusterAssment = np.mat(np.zeros((m, 2)))  # bug fix: `zeros` -> `np.zeros` (NameError)
    clusterChanged = True  # loop until no point changes cluster
    centroids = randCent(dataArr, k)  # random initial centroids
    while clusterChanged:
        clusterChanged = False
        # 1. Assign every point to the nearest centroid.
        for i in range(m):
            minDist = float('inf')
            minIndex = -1
            for j in range(k):
                distJI = distEuclidean(centroids[j], dataArr.iloc[[i]].values)
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # 2. Recompute each centroid as the mean of its assigned points.
        for cent in range(k):
            # Bug fix: indexing a DataFrame with an np.matrix boolean mask is
            # unreliable — select row positions explicitly (same idiom as the
            # plotting code uses).
            idx = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            if len(idx) > 0:  # guard: an empty cluster would yield a NaN centroid
                centroids[cent, :] = np.mean(dataArr.values[idx], axis=0)
    return centroids, clusterAssment
- 展示kMeans聚类结果
# Notebook plotting setup: import libraries and configure seaborn styling.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # import the seaborn package
sns.set_context("notebook",font_scale=1.3) # notebook context; enlarge axis tick labels
sns.set_palette('Set2') # use the Set2 colour palette
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # retina (high-DPI) figure rendering
def plotResult(data, k):
    """Run the hand-written kMeans on ``data`` and plot the clustering result.

    Prints the final centroids and per-point labels, then scatters the first
    two columns of ``data`` (one marker per cluster) with centroids as '+'.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples; the first two columns are used for the 2-D plot.
    k : int
        Number of clusters.
    """
    np.random.seed(6)
    centroids, clusterAssment = kMeans(data, k)
    print("最终质心:")
    print(centroids)
    print("点所属簇的标签:")
    print(clusterAssment[:, 0].T)
    rect = [0.1, 0.1, 0.8, 0.8]
    fig = plt.figure()
    markers = ['s', 'o', '^', 'd']
    ax = fig.add_axes(rect)
    # Bug fix: iterate over the requested k clusters, not a hard-coded range(4).
    for cent in range(k):
        # rows assigned to cluster `cent`
        ptsInCluster = data.values[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
        ax.scatter(ptsInCluster[:, 0],
                   ptsInCluster[:, 1],
                   marker=markers[cent % len(markers)],  # cycle markers for k > 4
                   s=45)
    # Centroids drawn last, as black '+' on top of the clusters.
    ax.scatter(centroids[:, 0].flatten().A[0],
               centroids[:, 1].flatten().A[0],
               marker='+',
               s=100,
               c='k')
- 加载数据,调用上述函数,评价聚类结果
# Load the sample data and visualise the hand-written K-Means with 4 clusters.
data = pd.read_csv('data.csv')
plotResult(data, 4)
任务2 sklearn中的kMeans
调用sklearn中提供的kMeans算法
在数据集data.csv上评价 K-Means 算法的聚类结果
绘出质心与所属样本
对比分析与自己实验的kmeans聚类的实验结果的差异
预期实验结果
# Import the KMeans implementation from scikit-learn.
import numpy as np
from sklearn.cluster import KMeans
# Plotting dependencies.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # import the seaborn package
sns.set_context("notebook",font_scale=1.3) # notebook context; enlarge axis tick labels
sns.set_palette('Set2') # use the Set2 colour palette
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # retina (high-DPI) figure rendering
def plotResult(data, k):
    """Run scikit-learn's KMeans on ``data`` and plot the clustering result.

    Prints the final centroids and per-point labels, then scatters the first
    two columns of ``data`` (one marker per cluster) with centroids as '+'.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples; the first two columns are used for the 2-D plot.
    k : int
        Number of clusters.
    """
    # Fit sklearn's KMeans with random (not k-means++) initialisation, to
    # mirror the hand-written implementation above.
    kmeans_ = KMeans(n_clusters=k, init='random')
    kmeans_.fit(data)
    centroids = kmeans_.cluster_centers_   # (k, n) ndarray
    clusterAssment = kmeans_.labels_       # (m,) ndarray of cluster indices
    print("最终质心:")
    print(centroids)
    print("点所属簇的标签:")
    print(clusterAssment)
    rect = [0.1, 0.1, 0.8, 0.8]
    fig = plt.figure()
    markers = ['s', 'o', '^', 'd']
    ax = fig.add_axes(rect)
    # Bug fix: iterate over the requested k clusters, not a hard-coded range(4).
    for cent in range(k):
        # rows assigned to cluster `cent`
        ptsInCluster = data.values[np.nonzero(clusterAssment == cent)[0]]
        ax.scatter(ptsInCluster[:, 0],
                   ptsInCluster[:, 1],
                   marker=markers[cent % len(markers)],  # cycle markers for k > 4
                   s=45)
    # Centroids drawn last, as black '+' on top of the clusters.
    ax.scatter(centroids[:, 0].flatten(),
               centroids[:, 1].flatten(),
               marker='+',
               s=100,
               c='k')
加载数据,调用上述函数,评价聚类结果
# Read the sample data, then run and visualise sklearn K-Means with 4 clusters.
data = pd.read_csv('data.csv')
plotResult(data, 4)