本篇主要记录一下几种常用的聚类算法
使用的参考代码和数据集还是(一)里面的
1.K-Means
算法思想大致为:先从样本集中随机选取 k个样本作为簇中心,并计算所有样本与这 k个“簇中心”的距离,对于每一个样本,将其划分到与其距离最近的“簇中心”所在的簇中,对于新的簇计算各个簇的新的“簇中心”。
根据以上描述,我们大致可以猜测到实现kmeans算法的主要四点:
(1)簇个数 k 的选择
(2)各个样本点到“簇中心”的距离
(3)根据新划分的簇,更新“簇中心”
(4)重复上述2、3过程,直至"簇中心"没有移动
优点:容易实现
缺点:可能收敛到局部最小值,在大规模数据上收敛较慢
详细原理参考:https://blog.csdn.net/qq_43741312/article/details/97128745
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
# 加载数据
def loadDataSet(fileName):
    """Read a CSV file and return its contents transposed as a numpy array."""
    frame = pd.read_csv(fileName)
    return frame.to_numpy().T
# 欧氏距离计算
def distEclud(x, y):
    """Return the Euclidean distance between points x and y."""
    squared_gap = np.square(x - y)
    return np.sqrt(squared_gap.sum())
# 为给定数据集构建一个包含K个随机质心的集合
def randCent(dataSet, k):
    """Pick k rows of dataSet at random to serve as initial centroids.

    Sampling is with replacement, so duplicate centroids are possible.
    """
    m, n = dataSet.shape
    chosen = np.zeros((k, n))
    for row in range(k):
        # same RNG call as the original so seeded runs are reproducible
        pick = int(np.random.uniform(0, m))
        chosen[row, :] = dataSet[pick, :]
    return chosen
# k均值聚类
def KMeans(dataSet, k):
    """Cluster dataSet into k clusters with Lloyd's algorithm.

    Args:
        dataSet: (m, n) array of samples, one per row.
        k: number of clusters.

    Returns:
        centroids: (k, n) array holding the final cluster centers.
        clusterAssment: (m, 2) np.mat; column 0 is the cluster index of
            each sample, column 1 its squared distance to that center.
    """
    m = np.shape(dataSet)[0]  # number of samples
    # column 0: assigned cluster, column 1: squared distance to its center
    clusterAssment = np.mat(np.zeros((m, 2)))
    clusterChange = True
    # Step 1: initialise the centroids with k random samples
    centroids = randCent(dataSet, k)
    while clusterChange:
        clusterChange = False
        # Step 2: assign every sample to its nearest centroid
        for i in range(m):
            # fix: np.inf instead of the arbitrary sentinel 100000.0,
            # which silently mis-assigned samples farther than that
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distance = distEclud(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            # Step 3: record the assignment; iterate while anything moves
            if clusterAssment[i, 0] != minIndex:
                clusterChange = True
                clusterAssment[i, :] = minIndex, minDist ** 2
        # Step 4: recompute each centroid as the mean of its members
        for j in range(k):
            # nonzero(...) yields the row indices currently in cluster j
            pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0]]
            # fix: np.mean of an empty cluster returns NaN and poisons
            # all later iterations; keep the previous centroid instead
            if len(pointsInCluster) > 0:
                centroids[j, :] = np.mean(pointsInCluster, axis=0)
    return centroids, clusterAssment
def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot a 2-D data set coloured by cluster, plus its centroids.

    Returns 1 (and plots nothing) when the data is not 2-D or k exceeds
    the size of the colour palette.
    """
    m, n = dataSet.shape
    if n != 2:
        print("数据不是二维的")
        return 1
    defalut_color = ['#95d0fc', '#96f97b', '#c79fef',
                     '#ff81c0', '#00035b', '#06c2ac',
                     '#ffff14', '#033500', '#c20078']
    if k > len(defalut_color):
        print("k值太大了")
        return 1
    # one scatter call per sample, coloured by its assigned cluster
    for row in range(m):
        cluster_id = int(clusterAssment[row, 0])
        plt.scatter(dataSet[row, 0], dataSet[row, 1],
                    color=defalut_color[cluster_id])
    # centroids drawn on top with distinct marker styles
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for cluster_id in range(k):
        plt.plot(centroids[cluster_id, 0], centroids[cluster_id, 1],
                 mark[cluster_id])
    plt.show()
if __name__ == "__main__":
    # Load the dataset (transposed CSV; see loadDataSet).
    data = loadDataSet("dataset/darmanis.csv")
    # Drop the header row and the first two (non-numeric) columns,
    # keep the remaining values as floats.
    matrix = data[1:, 2:].astype(float)
    # Scale every feature into [0, 1].
    scaler = MinMaxScaler()
    matrix_scaler = scaler.fit_transform(matrix)
    # Reduce to 2 dimensions with t-SNE so the clusters can be plotted.
    tsne = TSNE(n_components=2, perplexity=20)
    matrix_tsne = tsne.fit_transform(matrix_scaler)
    print(matrix_tsne.shape)
    k = 9
    centroids, clusterAssment = KMeans(matrix_tsne, k)
    print(centroids)
    showCluster(matrix_tsne, k, centroids, clusterAssment)
聚类效果
2.谱聚类
谱聚类是一种基于图论的聚类方法,通过对样本数据的拉普拉斯矩阵的特征向量进行聚类,从而达到对样本数据聚类的目的。谱聚类可以理解为将高维空间的数据映射到低维,然后在低维空间用其它聚类算法(如KMeans)进行聚类。
算法流程:
1.计算样本间的距离矩阵,并使用k近邻算法构建邻接矩阵
2.由邻接矩阵计算度矩阵和拉普拉斯矩阵,并对拉普拉斯矩阵做标准化
3.对标准化后的拉普拉斯矩阵做特征分解,将特征向量作为新的样本表示送入KMeans等算法聚类
from sklearn.cluster import SpectralClustering
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from K_Means import KMeans
from sklearn.metrics import adjusted_rand_score
'''
使用库函数实现谱聚类
dataset = pd.read_csv('dataset/darmanis.csv')
dataset = dataset.to_numpy().T
# print(dataset)
data = dataset[1:, 2:].astype(float)
# print(data.shape)
labels = dataset[:, 1]
# print(labels)
dim_data = TSNE(n_components=2).fit_transform(data)
y_pred = SpectralClustering(n_clusters=9, affinity='nearest_neighbors').fit_predict(dim_data)
print(y_pred.shape)
colors = get_color(y_pred)
draw_scatter(dim_data[:, 0], dim_data[:, 1], y_pred, colors)
'''
'''
谱聚类原理:
1.计算距离矩阵(例如欧氏距离)
2.利用KNN计算邻接矩阵A
3.由A计算度矩阵D和拉普拉斯矩阵L
4.标准化 L→D^(-1/2) L D^(-1/2)
5.对矩阵 D^(-1/2) L D^(-1/2)进行特征值分解,得到特征向量H
6.将H当成样本送入Kmeans聚类,获得聚类结果C=(C1,C2,⋯,Ck)
'''
# loading dataset
def loadData(filename):
    """Read a CSV file and return its transposed numpy representation."""
    table = pd.read_csv(filename).to_numpy()
    return table.T
# normalization
def normalization(data):
    """Min-max scale every column of `data` into [0, 1]."""
    col_max = data.max(axis=0)
    col_min = data.min(axis=0)
    # broadcast over columns: (x - min) / (max - min), same arithmetic
    # as the original per-column loop
    return (data - col_min) / (col_max - col_min)
# 计算距离矩阵
def distance(x1, x2, sqrt_flag=False):
    """Squared Euclidean distance between x1 and x2 (true distance if sqrt_flag)."""
    gap = np.sum((x1 - x2) ** 2)
    return np.sqrt(gap) if sqrt_flag else gap
def cal_distancematrxi(X):
    """Pairwise squared-Euclidean distance matrix of the rows of X.

    Replaces the original O(n^2) pure-Python double loop with a single
    broadcast expression; element arithmetic (sum of squared diffs) is
    identical, so the result is the same symmetric matrix with a zero
    diagonal.
    """
    X = np.array(X)
    # (n, 1, d) - (1, n, d) -> (n, n, d) pairwise differences
    diffs = X[:, None, :] - X[None, :, :]
    return np.sum(diffs ** 2, axis=-1)
# 计算邻接矩阵
def myKNN(S, k, sigma=1.0):
    """Build a symmetric k-NN adjacency matrix with Gaussian weights.

    For each sample the k+1 smallest entries of its row of S (which
    include the sample itself, since S[i][i] == 0) are linked with
    weight exp(-d / (2*sigma^2)); links are mirrored so A is symmetric.
    """
    N = len(S)
    A = np.zeros((N, N))
    for i in range(N):
        # stable argsort reproduces the tie-breaking of sorting
        # (distance, index) tuples as the original did
        nearest = np.argsort(S[i], kind="stable")[:k + 1]
        for j in nearest:
            weight = np.exp(-S[i][j] / 2 / sigma / sigma)
            A[i][j] = weight
            A[j][i] = weight
    return A
# Laplacian
def calLaplacianMatrix(adjacentMatrix):
    """Symmetrically normalised graph Laplacian D^(-1/2) (D - A) D^(-1/2)."""
    # degree of each node: row sums of the adjacency matrix
    degrees = np.sum(adjacentMatrix, axis=1)
    # unnormalised Laplacian L = D - A
    laplacian = np.diag(degrees) - adjacentMatrix
    # D^(-1/2) as a diagonal matrix
    d_inv_sqrt = np.diag(1.0 / (degrees ** 0.5))
    return d_inv_sqrt.dot(laplacian).dot(d_inv_sqrt)
if __name__ == '__main__':
    dataset = loadData('dataset/darmanis.csv')
    # numeric values start at row 1 / column 2 of the transposed table
    data = dataset[1:, 2:].astype(np.float32)
    data = normalization(data)
    n_cells, n_genes = data.shape
    # column 1 holds the ground-truth cluster label of every cell
    true_labels = dataset[1:, 1].astype(int).flatten()
    print(true_labels)
    print(true_labels.shape)
    # pairwise (squared) distance matrix between cells
    Similar_matrix = cal_distancematrxi(data)
    # k-nearest-neighbour adjacency matrix with Gaussian weights
    Adjacent = myKNN(Similar_matrix, k=9)
    # symmetrically normalised graph Laplacian
    Laplacian = calLaplacianMatrix(Adjacent)
    # eigen-decomposition of the Laplacian
    eigvalues, eigvector = np.linalg.eig(Laplacian)
    # sort the eigenvalues, keeping track of their original positions
    eigvalues = zip(eigvalues, range(len(eigvalues)))
    eigvalues = sorted(eigvalues, key=lambda eigvalues: eigvalues[0])
    # stack the eigenvectors in ascending-eigenvalue order, one per column
    eig_matrix = np.vstack([eigvector[:, i] for (v, i) in eigvalues[:]]).T
    eig_matrix = np.array(eig_matrix)
    print(eig_matrix.shape)
    print(data.shape)
    # t-SNE to 2 dimensions for plotting / clustering
    dim_data = TSNE(n_components=2).fit_transform(data)
    print(dim_data.shape)
    dim_eig = TSNE(n_components=2).fit_transform(eig_matrix)
    print(dim_eig.shape)
    defalut_color = ['#FF0000', '#FF1493', '#9400D3', '#7B68EE', '#FFD700',
                     '#00BFFF', '#00FF00', '#FF8C00', '#FF4500', '#8B4513',
                     '#1F77B4', '#FF7F0E', '#2CA02C', '#D62728']
    # spectral clustering: run the hand-written KMeans on the eigenvector embedding
    _, clusterAssment1 = KMeans(dim_eig, 9)
    eig_labels = np.zeros((n_cells, 1))
    plt.subplot(1, 2, 1)
    plt.title('Spectral')
    for i in range(dim_eig.shape[0]):
        colorIndex = int(clusterAssment1[i, 0])
        eig_labels[i] = clusterAssment1[i, 0]
        plt.scatter(dim_eig[i, 0], dim_eig[i, 1], color=defalut_color[colorIndex])
    eig_labels = eig_labels.astype(int).flatten()
    print(eig_labels)
    # adjusted Rand index of spectral clustering against the true labels
    print(adjusted_rand_score(eig_labels, true_labels))
    # baseline: the same KMeans on the t-SNE embedding of the raw data
    _, clusterAssment2 = KMeans(dim_data, 9)
    data_pre = np.zeros((n_cells, 1))
    plt.subplot(1, 2, 2)
    plt.title('K-means')
    for i in range(dim_data.shape[0]):
        colorIndex = int(clusterAssment2[i, 0])
        data_pre[i] = clusterAssment2[i, 0]
        plt.scatter(dim_data[i, 0], dim_data[i, 1], color=defalut_color[colorIndex])
    data_pre = data_pre.astype(int).flatten()
    print(adjusted_rand_score(data_pre, true_labels))
    plt.show()
聚类效果
优点
(1)当聚类的类别个数较小的时候,谱聚类的效果会很好,但是当聚类的类别个数较大的时候,则不建议使用谱聚类;
(2)谱聚类算法使用了降维的技术,所以更加适用于高维数据的聚类;
(3)谱聚类只需要数据之间的相似度矩阵,因此对于处理稀疏数据的聚类很有效。这点传统聚类算法(比如K-Means)很难做到
(4)谱聚类算法建立在谱图理论基础上,与传统的聚类算法相比,它具有能在任意形状的样本空间上聚类且收敛于全局最优解
缺点
(1)谱聚类对相似度图的改变和聚类参数的选择非常的敏感;
(2)谱聚类适用于均衡分类问题,即各簇之间点的个数相差不大,对于簇之间点个数相差悬殊的聚类问题,谱聚类则不适用;
详细原理参考:https://blog.csdn.net/qq_24519677/article/details/82291867
3.GMM(高斯混合分布模型)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
'''
GMM算法流程:
1.初始化k个分布函数的参数w,μ,sigma
2.按照不同分布函数计算的概率将数据“分簇”
3.根据最大似然原则对参数进行更新
'''
# loading dataset
def loadData(filename):
    """Load a CSV file and return the transposed data as a numpy array."""
    return pd.read_csv(filename).to_numpy().T
# normalization
def normalization(data):
    """Scale each column of `data` to the [0, 1] range (min-max)."""
    col_max = data.max(axis=0)
    col_min = data.min(axis=0)
    n_rows, n_cols = data.shape
    scaled = np.empty((n_rows, n_cols))
    # per-column: (x - min) / (max - min)
    for col in range(n_cols):
        span = col_max[col] - col_min[col]
        scaled[:, col] = (data[:, col] - col_min[col]) / span
    return scaled
# Initialization parameter
def inti_GMM(dataset, K):
    """Initialise GMM parameters.

    Means are spread evenly across the per-feature data range,
    covariances start spherical (0.5*I) and mixture weights uniform.
    """
    N, D = np.shape(dataset)
    lo = np.min(dataset, axis=0)
    hi = np.max(dataset, axis=0)
    # K+2 evenly spaced points per feature; drop the two endpoints and
    # keep the K interior points as the initial means
    grid = np.linspace(lo, hi, num=K + 2)
    mus = grid[1:-1, :]
    # isotropic covariance 0.5*I for every component
    sigmas = np.array([0.5 * np.eye(D) for _ in range(K)])
    ws = 1.0 / K * np.ones(K)
    return mus, sigmas, ws
# 计算一个高斯的pdf x: 数据 [N,D] sigma 协方差 [D,D] mu 均值 [1,D]
def getPdf(x, mu, sigma, eps=1e-12):
    """Gaussian density of every row of x under N(mu, sigma).

    x: [N, D] data, mu: [1, D] mean, sigma: [D, D] covariance.
    Returns a list of N density values, each floored by eps.
    """
    N, D = np.shape(x)
    # regularise sigma so it is invertible, then precompute its inverse
    # and |det(sigma)|
    if D == 1:
        sigma = sigma + eps
        A = 1.0 / (sigma)
        det = np.fabs(sigma[0])
    else:
        sigma = sigma + eps * np.eye(D)
        A = np.linalg.inv(sigma)
        det = np.fabs(np.linalg.det(sigma))
    # normalisation constant (2*pi)^(D/2) * |det|^(1/2)
    factor = (2.0 * np.pi) ** (D / 2.0) * (det) ** (0.5)
    # quadratic form (x-mu)^T A (x-mu) per sample
    dx = x - mu
    pdf = []
    for row in range(N):
        quad = np.dot(np.dot(dx[row], A), dx[row])
        pdf.append((np.exp(-0.5 * quad) + eps) / factor)
    return pdf
def train_GMM_step(dataset, mus, sigmas, ws):
    """Run one EM iteration and return the updated GMM parameters.

    Note: mus, sigmas and ws are also updated in place (the per-k
    assignments at the bottom write into the passed-in arrays).
    """
    N, D = np.shape(dataset)
    K, D = np.shape(mus)
    # E step: density of every sample under every component
    pdfs = np.zeros([N, K])
    for k in range(K):
        pdfs[:, k] = getPdf(dataset, mus[k], sigmas[k])
    # responsibilities r[i, k]: normalised weighted densities
    # (np.tile repeats ws over the N rows)
    r = pdfs * np.tile(ws, (N, 1))
    r_sum = np.tile(np.sum(r, axis=1, keepdims=True), (1, K))
    r = r / r_sum
    # M step: re-estimate each component from its responsibilities
    for k in range(K):
        r_k = r[:, k]
        N_k = np.sum(r_k)
        r_k = r_k[:, np.newaxis]  # [N,1]
        # update mu: responsibility-weighted mean
        mu = np.sum(dataset * r_k, axis=0) / N_k  # [D,1]
        # update sigma: responsibility-weighted covariance
        dx = dataset - mu
        sigma = np.zeros([D, D])
        for i in range(N):
            sigma = sigma + r_k[i, 0] * np.outer(dx[i], dx[i])
        sigma = sigma / N_k
        # update w: fraction of the total responsibility mass
        w = N_k / N
        mus[k] = mu
        sigmas[k] = sigma
        ws[k] = w
    return mus, sigmas, ws
def train_GMM(dataset, K=2, m=10):
    """Fit a K-component GMM to dataset with m EM iterations.

    Returns the final (mus, sigmas, ws).
    """
    mus, sigmas, ws = inti_GMM(dataset, K)
    for _ in range(m):
        # bug fix: the result used to be bound to the misspelled name
        # `sigms`, so `sigmas` stayed current only because
        # train_GMM_step mutates it in place; bind and return the
        # correct name.
        mus, sigmas, ws = train_GMM_step(dataset, mus, sigmas, ws)
    return mus, sigmas, ws
# 得到最大似然估计
def getlogPdfFromeGMM(datas, mus, sigmas, ws):
    """Weighted log-density of each sample under each GMM component.

    Returns the [N, K] matrix log(pdf_k(x_i)) + log(w_k) together with
    its per-sample row sums.
    """
    N, D = np.shape(datas)
    K, D = np.shape(mus)
    weightedlogPdf = np.zeros([N, K])
    for comp in range(K):
        density = getPdf(datas, mus[comp], sigmas[comp], eps=1e-12)
        weightedlogPdf[:, comp] = np.log(density) + np.log(ws[comp])
    return weightedlogPdf, np.sum(weightedlogPdf, axis=1)
# 通过GMM获取簇标签
def clusterByGMM(datas, mus, sigmas, ws):
    """Assign each sample to the component with the highest weighted log-density."""
    scores, _ = getlogPdfFromeGMM(datas, mus, sigmas, ws)
    return np.argmax(scores, axis=1)
def draw_cluster(dataset, lab, colors):
    """Scatter-plot `dataset` coloured by the label array `lab`."""
    plt.cla()
    # one scatter call per distinct label value
    for color_idx, val in enumerate(set(lab.tolist())):
        members = np.where(lab == val)[0]
        subset = dataset[members, :]
        plt.scatter(subset[:, 0], subset[:, 1], s=16., color=colors[color_idx])
if __name__=='__main__':
    dataset=loadData('dataset/darmanis.csv')
    # numeric values start at row 1 / column 2; column 1 holds the labels
    data = dataset[1:, 2:].astype(np.float32)
    data = normalization(data)
    labels = np.array(dataset[1:, 1:2]).flatten()
    # reduce to 2 dimensions for clustering and plotting
    dim_data=TSNE(n_components=2).fit_transform(data)
    colors = ['#95d0fc', '#96f97b', '#c79fef',
              '#ff81c0', '#00035b', '#06c2ac',
              '#ffff14', '#033500', '#c20078']
    # fit a 9-component GMM with 100 EM iterations, then read off clusters
    mius,sigmas,ws=train_GMM(dim_data,K=9,m=100)
    gmm_labs=clusterByGMM(dim_data,mius,sigmas,ws)
    # baseline: sklearn KMeans on the same 2-D embedding
    km=KMeans(n_clusters=9).fit(dim_data)
    km_labs=km.labels_
    # normalised mutual information of each clustering vs the true labels
    print(normalized_mutual_info_score(gmm_labs,labels))
    print(normalized_mutual_info_score(km_labs,labels))
    plt.subplot(1,2,1)
    draw_cluster(dim_data,gmm_labs,colors)
    plt.subplot(1, 2, 2)
    draw_cluster(dim_data, km_labs, colors)
    plt.show()
聚类效果
左图为GMM聚类效果,右图为KMEANS聚类效果
GMM和KMeans的比较
在特定条件下,k-means和GMM方法可以互相用对方的思想来表达。在k-means中根据距离每个点最接近的类中心来标记该点的类别,这里存在的假设是每个类簇的尺度接近且特征的分布不存在不均匀性。这也解释了为什么在使用k-means前对数据进行归一会有效果。高斯混合模型则不会受到这个约束,因为它对每个类簇分别考察特征的协方差模型。
K-means算法可以被视为高斯混合模型(GMM)的一种特殊形式。整体上看,高斯混合模型能提供更强的描述能力,因为聚类时数据点的从属关系不仅与近邻相关,还会依赖于类簇的形状。n维高斯分布的形状由每个类簇的协方差来决定。在协方差矩阵上添加特定的约束条件后,可能会通过GMM和k-means得到相同的结果。
实践中如果每个类簇的协方差矩阵绑定在一起(就是说它们完全相同),并且矩阵对角线上的协方差数值保持相同,其他数值则全部为0,这样能够生成具有相同尺寸且形状为圆形类簇。在此条件下,每个点都始终属于最近的中间点对应的类。(达观数据 陈运文)
与k-means方法类似,使用EM来训练高斯混合模型时对初始值的设置非常敏感。而对比k-means,GMM方法有更多的初始条件要设置。实践中不仅初始类中心要指定,而且协方差矩阵和混合权重也要设置。可以运行k-means来生成类中心,并以此作为高斯混合模型的初始条件。由此可见两个算法有相似的处理过程,主要区别在于模型的复杂度不同。
http://www.ituring.com.cn/article/497545
4.KNN
最近邻 (k-Nearest Neighbors, KNN) 算法是一种分类算法, 1968年由 Cover和 Hart 提出, 应用场景有字符识别、 文本分类、 图像识别等领域。
该算法的思想是: 一个样本与数据集中的k个样本最相似, 如果这k个样本中的大多数属于某一个类别, 则该样本也属于这个类别。
算法流程:
1) 计算已知类别数据集中的点与当前点之间的距离
2) 按距离递增次序排序
3) 选取与当前点距离最小的k个点
4) 统计前k个点所在的类别出现的频率
5) 返回前k个点出现频率最高的类别作为当前点的预测分类
详细原理参考:https://blog.csdn.net/sinat_30353259/article/details/80901746
import numpy as np
import pandas as pd
import operator
from sklearn.model_selection import train_test_split
def loadData(filename):
    """Read a CSV file and return its contents transposed as an ndarray."""
    frame = pd.read_csv(filename)
    return frame.to_numpy().T
def normalization(data):
    """Min-max scale every column of `data` into [0, 1].

    Bug fix: the original computed
        data[:, i] - mincols[i] / maxcols[i] - mincols[i]
    which, by operator precedence, is  data - (min/max) - min
    instead of the intended (data - min) / (max - min); the missing
    parentheses are restored here (matching the `normalization`
    implementations used by the other scripts in this file).
    """
    maxcols = data.max(axis=0)
    mincols = data.min(axis=0)
    data_rows, data_cols = data.shape
    nor_data = np.zeros((data_rows, data_cols))
    for i in range(data_cols):
        nor_data[:, i] = (data[:, i] - mincols[i]) / (maxcols[i] - mincols[i])
    return nor_data
def knn(train_data, test_data, labels, k):
    """Predict the label of one test sample by majority vote of its
    k nearest training samples (Euclidean distance).

    Args:
        train_data: [N, D] training samples.
        test_data: [D] single test sample.
        labels: [N] training labels.
        k: number of neighbours that vote.

    Returns:
        The most frequent label among the k nearest neighbours; ties
        break in favour of the label counted first, matching the
        original stable reverse-sort behaviour.
    """
    # Euclidean distance from the test sample to every training row;
    # broadcasting makes the original np.tile copy unnecessary.
    distances = np.sqrt(np.sum((test_data - train_data) ** 2, axis=1))
    # training-sample indices ordered nearest to farthest
    sortDistance = np.argsort(distances)
    # tally the labels of the k nearest neighbours
    count = {}
    for i in range(k):
        vote = labels[sortDistance[i]]
        count[vote] = count.get(vote, 0) + 1
    # fixes: removed the leftover debug print of the vote table, and
    # replaced the O(k log k) sort with max(); insertion-order
    # tie-breaking is identical because both are stable.
    return max(count, key=count.get)
# --- evaluation script: split the dataset and measure KNN accuracy ---
dataset = loadData('dataset/darmanis.csv')
# numeric values start at row 1 / column 2; column 1 holds the labels
data = dataset[1:, 2:].astype(np.float32)
data = normalization(data)
labels = np.array(dataset[1:, 1:2]).flatten()
# 70/30 train/test split with a fixed seed for reproducibility
train_data, test_data, train_labs, test_labs = train_test_split(data, labels, test_size=0.30, random_state=42)
k = 5
n_right = 0
n_test = test_data.shape[0]
# classify every test sample and count the correct predictions
for i in range(n_test):
    test = test_data[i, :]
    pre_labs = knn(train_data, test, train_labs, k)
    if pre_labs == test_labs[i]:
        n_right += 1
    print('Sample %d lab_true=%s lab_det=%s' % (i, test_labs[i], pre_labs))
print('Accuracy=%2f%%' % (n_right * 100 / n_test))
优点:
1、简单有效
2、重新训练代价低
3、算法复杂度低
4、适合类域交叉样本
5、适用大样本自动分类
缺点:
1、惰性学习
2、类别分类不标准化
3、输出可解释性不强
4、不均衡性
5、计算量较大