一、预设
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt #绘图
from mpl_toolkits.mplot3d import Axes3D #可视化
import pandas as pd
from itertools import chain
plt.rcParams['font.sans-serif'] = ['SimHei']
from sklearn import metrics
iris = load_iris()
X = iris.data  # feature matrix: 150 iris samples x 4 features (sepal/petal length & width)
二、探索数据分布
# Plot the raw data distribution (sepal length vs. sepal width)
plt.figure()  # create a new figure
plt.scatter(X[:, 0], X[:, 1], c="red", marker='o', label='datas')  # scatter of all samples
plt.xlabel('Sepal.Length')  # x-axis label (original comment wrongly said "datas")
plt.ylabel('Sepal.Width')
plt.legend(loc=1)  # legend in the upper-right corner
<matplotlib.legend.Legend at 0x1b7e5f8d850>
三、K-means
import random
import pandas as pd
import numpy as np
from itertools import chain
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
# Euclidean distances from samples to centroids
def calcDis(dataSet, centroids, k):
    """Return an (m, k) array of Euclidean distances.

    Row i holds the distance from sample dataSet[i] to each of the
    k centroids.
    """
    rows = []
    for sample in dataSet:
        # broadcast the sample against all k centroids at once
        delta = np.tile(sample, (k, 1)) - centroids
        rows.append(np.sum(delta ** 2, axis=1) ** 0.5)
    return np.array(rows)
# One k-means update step
def classify(dataSet, centroids, k):
    """Assign each sample to its nearest centroid and recompute centroids.

    Returns (change, newCentroids) where `change` is the element-wise
    displacement of the centroids since the previous iteration.
    """
    distances = calcDis(dataSet, centroids, k)        # (m, k) distance matrix
    nearest = np.argmin(distances, axis=1)            # nearest-centroid index per sample
    # group rows by assignment and average -> updated centroids
    updated = pd.DataFrame(dataSet).groupby(nearest).mean().values
    return updated - centroids, updated
# K-means clustering
def kmeans(dataSet, k):
    """Cluster the first two columns of dataSet into k groups.

    Returns (centroids, cluster, minDistIndices, curr):
      centroids      -- final centroids, sorted (list of [x, y])
      cluster        -- list of k lists of (1, 2) row slices, grouped by cluster
      minDistIndices -- nearest-centroid index for every sample
      curr           -- count of samples whose cluster index equals dataSet[:, 2]
    """
    # pick k random samples (first two features) as initial centroids
    centroids = random.sample(list(dataSet[:,:2]), k)
    # iterate until no centroid moves at all
    change, newCentroids = classify(dataSet[:,:2], centroids, k)
    while np.any(change != 0):
        change, newCentroids = classify(dataSet[:,:2], newCentroids, k)
    centroids = sorted(newCentroids.tolist())  # sort for a deterministic ordering
    # assign every sample to its nearest final centroid
    cluster = []
    curr = 0
    clalist = calcDis(dataSet[:,:2],centroids,k)  # distances to the final centroids
    minDistIndices = np.argmin(clalist,axis = 1)  # nearest-centroid index per sample
    for i in range(len(minDistIndices)):
        # NOTE(review): this treats column 2 of dataSet as a class label and
        # counts matches with the cluster index; when called with iris.data,
        # column 2 is Petal.Length, so `curr` is not a real accuracy — verify.
        if minDistIndices[i] == dataSet[i:i+1,2]:
            curr += 1
    for i in range(k):
        cluster.append([])
    for i, j in enumerate(minDistIndices):  # group sample rows by cluster index
        cluster[j].append(dataSet[i:i+1,:2])
    return centroids,cluster,minDistIndices,curr
# Visualise the clustering result
def Draw(centroids,cluster):
    """Scatter-plot the three clusters and mark the centroids with red crosses."""
    styles = [('*', 'green', 'Iris-setosa'),
              ('o', 'blue', 'Iris-versicolor'),
              ('+', 'orange', 'Iris-virginica')]
    ax = plt.subplot()
    for pts, (mk, col, lab) in zip(cluster, styles):
        # each cluster is a list of (1, 2) row slices; flatten to an (n, 2) array
        arr = np.array(list(chain.from_iterable(pts)))
        ax.scatter(arr[:, 0], arr[:, 1], marker=mk, c=col, label=lab)
    ax.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='red', label='centroid')
    plt.title("k-means clustering results")
    plt.xlabel("Sepal.Length")
    plt.ylabel("Petal.Length")
    plt.legend()
    plt.show()
def eva_index(dataSet):
    """Plot k-means evaluation curves (SSE elbow and silhouette) for k = 2..8.

    Parameters
    ----------
    dataSet : ndarray
        Sample matrix; passed straight to kmeans(), which clusters on the
        first two columns.
    """
    sse_list = []      # within-cluster sum of squared errors for each k
    silhouettes = []   # silhouette coefficient for each k
    for k in range(2, 9):
        # BUG FIX: the original called kmeans(dataset, k) — the *global*
        # `dataset` — instead of the `dataSet` parameter.
        centroids, cluster, minDistance, curr = kmeans(dataSet, k)
        minDistance = np.array(minDistance)
        centroids = np.array(centroids)
        # accumulate squared distances of cluster members to their centroid
        sse = 0  # renamed from `sum`, which shadowed the builtin
        for i in range(len(cluster)):
            sse += np.sum((cluster[i] - centroids[i]) ** 2)
        sse_list.append(sse)
        # silhouette on the same two features used for clustering
        silhouette = metrics.silhouette_score(dataSet[:, :2], minDistance, metric='euclidean')
        silhouettes.append(silhouette)
    # SSE (elbow) curve
    plt.subplot(211)
    plt.title('KMeans Intra-cluster error variance')
    plt.plot(range(2, 9), sse_list, marker='*')
    plt.xlabel('Number of clusters')
    plt.ylabel('Intra-cluster error variance(SSE)')
    plt.show()
    # silhouette coefficient curve
    plt.subplot(212)
    plt.title('KMeans Profile coefficient curve')
    plt.plot(range(2, 9), silhouettes, marker='+')
    plt.xlabel('Number of clusters')
    plt.ylabel('silhouette coefficient')
    plt.show()
if __name__=='__main__':
    dataset = X  # iris feature matrix loaded above
    eva_index(dataset)  # plot k-means evaluation curves
    centroids, cluster,minDistance, curr = kmeans(dataset, 3)  # cluster with k = 3
    centroids = np.array(centroids)
    Draw(centroids,cluster)  # visualise the k = 3 result
四、高斯混合聚类
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# Euclidean distances from samples to centroids
def calcDis(dataSet, centroids, k):
    """Return an (m, k) array: distance of each sample to each centroid."""
    dists = [
        np.sum((np.tile(pt, (k, 1)) - centroids) ** 2, axis=1) ** 0.5
        for pt in dataSet
    ]
    return np.array(dists)
# Single k-means iteration
def classify(dataSet, centroids, k):
    """Assign samples to nearest centroids, recompute them, report the shift."""
    # nearest-centroid assignment for every sample
    assignment = np.argmin(calcDis(dataSet, centroids, k), axis=1)
    # average the rows of each group -> fresh centroids
    fresh = pd.DataFrame(dataSet).groupby(assignment).mean().values
    delta = fresh - centroids  # how far each centroid moved this step
    return delta, fresh
# K-means, used here only to seed the Gaussian mixture model
def kmeans(dataSet, k):
    """Run k-means to convergence; return the final centroids, sorted."""
    # k random samples serve as the initial centroids
    centroids = random.sample(list(dataSet), k)
    change, newCentroids = classify(dataSet, centroids, k)
    # keep iterating until no centroid moved at all
    while np.any(change != 0):
        change, newCentroids = classify(dataSet, newCentroids, k)
    return sorted(newCentroids.tolist())
# Multivariate Gaussian probability density
def prob(x, mu, sigma):
    """Density of N(mu, sigma) evaluated at x (all np.mat row vectors/matrices)."""
    dim = np.shape(x)[1]
    dev = x - mu  # deviation from the mean (row vector)
    # Mahalanobis quadratic form: dev * sigma^-1 * dev^T (np.matrix products)
    expOn = float(-0.5 * dev * np.linalg.inv(sigma) * dev.T)
    # normalisation constant: (2*pi)^(d/2) * |sigma|^(1/2)
    norm = pow(2 * np.pi, dim / 2) * pow(np.linalg.det(sigma), 0.5)
    return pow(np.e, expOn) / norm
# EM algorithm for a Gaussian mixture
def EM(dataMat,centroids, maxIter,c):
    """Run maxIter EM iterations and return the (m, c) responsibility matrix.

    Parameters: dataMat -- (m, n) np.matrix of samples; centroids -- initial
    component means; maxIter -- number of EM iterations; c -- component count.
    NOTE(review): alpha and sigma below are hard-coded for c == 3 components
    and n == 2 features even though c is a parameter — confirm before reusing
    with other values of c.
    """
    m, n = np.shape(dataMat)  # m samples, n features (150 x 2 in this script)
    # initialise the mixture parameters
    alpha = [1 / 3, 1 / 3, 1 / 3]  # mixing coefficients
    mu = np.mat(centroids)  # component means, seeded from the k-means centroids
    # mu = random.sample(list(dataMat), c)
    sigma = [np.mat([[0.1, 0], [0, 0.1]]) for x in range(3)]  # initial covariance matrices
    gamma = np.mat(np.zeros((m, c)))  # responsibilities γ(ik)
    for i in range(maxIter):
        # E-step: posterior probability of each component for each sample
        for j in range(m):
            sumAlphaMulP = 0
            for k in range(c):
                gamma[j, k] = alpha[k] * prob(dataMat[j, :], mu[k], sigma[k])
                sumAlphaMulP += gamma[j, k]
            for k in range(c):
                gamma[j, k] /= sumAlphaMulP  # normalise into a posterior distribution
        sumGamma = np.sum(gamma, axis=0)
        # M-step: re-estimate means, covariances and mixing coefficients
        for k in range(c):
            mu[k] = np.mat(np.zeros((1, n)))
            sigma[k] = np.mat(np.zeros((n, n)))
            for j in range(m):
                mu[k] += gamma[j, k] * dataMat[j, :]
            mu[k] /= sumGamma[0, k]  # updated mean vector
            for j in range(m):
                sigma[k] += gamma[j, k] * (dataMat[j, :] - mu[k]).T *(dataMat[j, :] - mu[k])
            sigma[k] /= sumGamma[0, k]  # updated covariance matrix
            alpha[k] = sumGamma[0, k] / m  # updated mixing coefficient
    return gamma
# Gaussian mixture clustering driven by EM
def gaussianCluster(dataMat,centroids,k):
    """Assign every sample to its most probable Gaussian component.

    Returns (centroids, clusterAssign): per-component mean vectors and an
    (m, n) matrix whose column 0 is the component index and column 1 the
    winning posterior probability.
    NOTE(review): EM is called with 5 iterations and 3 components regardless
    of k — confirm this hard-coding is intended.
    """
    m, n = np.shape(dataMat)
    clusterAssign = np.mat(np.zeros((m, n)))
    gamma = EM(dataMat, centroids, 5, 3)  # responsibilities after 5 EM steps
    centroids = np.array(centroids)
    for row in range(m):
        # column 0: most probable component; column 1: its posterior
        clusterAssign[row, :] = np.argmax(gamma[row, :]), np.amax(gamma[row, :])
    for comp in range(k):
        # mean of the samples assigned to this component
        members = dataMat[np.nonzero(clusterAssign[:, 0].A == comp)[0]]
        centroids[comp, :] = np.mean(members, axis=0)
    return centroids, clusterAssign
def showCluster(dataMat, k, centroids, clusterAssment):
    """Plot samples coloured by their assigned component, plus red centroid crosses."""
    numSamples, dim = dataMat.shape
    markers = ['o', '+', '*']
    colors = ['green', 'blue', 'orange']
    for idx in range(numSamples):
        comp = int(clusterAssment[idx, 0])  # component index of this sample
        plt.plot(dataMat[idx, 0], dataMat[idx, 1], markers[comp], c=colors[comp], markersize=7)
    for idx in range(k):
        plt.plot(centroids[idx, 0], centroids[idx, 1], marker='x', c='red', label='centroid', markersize=10)
    plt.title("GMM clustering results")
    plt.xlabel("Sepal.Length")
    plt.ylabel("Petal.Length")
    plt.show()
if __name__=="__main__": # 导入数据
dataMat = np.mat(X[:,0:2])
centroids = kmeans(X[:,0:2], 3) # 使用K_means算法初始化质心
centroids, clusterAssign = gaussianCluster(dataMat,centroids,3) # GMM算法
curr = 0
showCluster(dataMat, 3, centroids, clusterAssign) # 可视化结果
# Reference run: scikit-learn's GaussianMixture on all four features
from sklearn.mixture import GaussianMixture
clf=GaussianMixture(n_components=3)
clf.fit(X)
result=clf.predict(X)
plt.scatter(X[:,0],X[:,1],c=result)  # colour samples by predicted component
# scores computed on the first two features only
silhouette = metrics.silhouette_score(X[:,:2],result,metric='euclidean')
kkk1 = metrics.calinski_harabasz_score(X[:,:2],result)
plt.xlabel("Sepal.Length")
plt.ylabel("Sepal.Wideth")  # NOTE(review): "Wideth" is a typo in the label text
Text(0, 0.5, 'Sepal.Wideth')
五、高斯混合聚类的评价
# Evaluate GaussianMixture for component counts 2..8
silhouettes = []  # silhouette coefficient per k
kkk1 = []         # Calinski-Harabasz index per k
for k in range(2, 9):
    clf = GaussianMixture(n_components=k)
    clf.fit(X)
    result = clf.predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=result)
    # scores are computed on the first two features only
    silhouette = metrics.silhouette_score(X[:, :2], result, metric='euclidean')
    kkk = metrics.calinski_harabasz_score(X[:, :2], result)
    silhouettes.append(silhouette)
    kkk1.append(kkk)
# BUG FIX: the original first plot passed an undefined name `s` to plt.plot
# (no SSE was ever computed for the GMM), raising NameError; that broken
# "SSE" plot has been removed.
# silhouette coefficient curve
plt.subplot(212)
plt.title('高斯混合聚类 轮廓系数曲线')
plt.plot(range(2, 9), silhouettes, marker='+')
plt.xlabel('Number of clusters')
plt.ylabel('silhouette coefficient')
plt.show()
# Calinski-Harabasz curve
plt.subplot(212)
plt.title('高斯混合聚类 calinski_harabasz曲线')
plt.plot(range(2, 9), kkk1, marker='+')
plt.xlabel('Number of clusters')
plt.ylabel('calinski_harabasz score')  # BUG FIX: was mislabelled 'silhouette coefficient'
plt.show()
六、DBSCAN
from sklearn.cluster import DBSCAN
# density-based clustering; label -1 marks noise points (see output below)
db = DBSCAN(eps=0.6,min_samples=3).fit_predict(X)
plt.scatter(X[:,0],X[:,1],c=db)  # colour samples by DBSCAN label
plt.xlabel("Sepal.Length")
plt.ylabel("Sepal.Wideth")  # NOTE(review): "Wideth" is a typo in the label text
Text(0, 0.5, 'Sepal.Wideth')
db
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, -1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
dtype=int64)
七、DBSCAN的评价
# Evaluate DBSCAN while varying min_samples from 2 to 8 (eps fixed at 0.6)
silhouettes = []  # silhouette coefficient per min_samples value
kkk1 = []         # Calinski-Harabasz index per min_samples value
for k in range(2, 9):
    db = DBSCAN(eps=0.6, min_samples=k).fit_predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=db)
    # NOTE(review): silhouette_score raises if DBSCAN produces < 2 labels;
    # that holds for eps=0.6 on iris but verify before changing eps.
    silhouette = metrics.silhouette_score(X[:, :2], db, metric='euclidean')
    kkk = metrics.calinski_harabasz_score(X[:, :2], db)
    silhouettes.append(silhouette)
    kkk1.append(kkk)
# BUG FIX: the original first plot passed an undefined name `s` to plt.plot,
# raising NameError; that broken "SSE" plot has been removed.
# silhouette curve — the x-axis is min_samples, not a cluster count
plt.subplot(212)
plt.title('DBSCAN 轮廓系数曲线')
plt.plot(range(2, 9), silhouettes, marker='+')
plt.xlabel('min_samples')  # BUG FIX: was mislabelled 'Number of clusters'
plt.ylabel('silhouette coefficient')
plt.show()
# Calinski-Harabasz curve
plt.subplot(212)
plt.title('DBSCAN calinski_harabasz曲线')
plt.plot(range(2, 9), kkk1, marker='+')
plt.xlabel('min_samples')  # BUG FIX: was mislabelled 'Number of clusters'
plt.ylabel('calinski_harabasz score')  # BUG FIX: was mislabelled 'silhouette coefficient'
plt.show()