算法思想
使得类内距离最小,使得类间距离最大。
算法流程
算法细节
距离度量
我们这里用到了欧氏距离和曼哈顿距离。
d(i, j)=\frac{\sum_{f=1}^{p} \delta_{i j}^{(f)} d_{i j}^{(f)}}{\sum_{f=1}^{p} \delta_{i j}^{(f)}}
\begin{aligned} &\text { Euclidean Distance }\\ &\text {dist}=\sqrt{\sum_{k=1}^{n}\left(p_{k}-q_{k}\right)^{2}} \end{aligned}
\begin{aligned} &\text { Manhattan Distance }\\ &\text {dist}=\sum_{k=1}^{n}\left|p_{k}-q_{k}\right| \end{aligned}
算法评估
SSE评估法
把n个数据对象划分到k个簇中去,满足每个点到其聚类中心的距离的平方和最小。
\mathrm{SSE}=\sum_{i=1}^{k} \sum_{p \in C_{i}}\left(p-c_{i}\right)^{2}
# SSE: sum of squared Euclidean distances from each point to its cluster center.
# NOTE(review): the local name 'sum' shadows the builtin of the same name.
sum = 0
for i in range(self.k_):
    for j in range(len(self.clf_[i])):
        sum += np.linalg.norm(self.clf_[i][j] - self.centers_[i]) ** 2
轮廓系数法
计算 a(i) = average(i 向量到它所属簇内其它所有点的距离)
计算 b(i) = min (i向量到与它相邻最近的一簇内的所有点的平均距离)
那么 i 向量轮廓系数就为:
S(i)=\frac{b(i)-a(i)}{\max \{a(i), b(i)\}}
可见轮廓系数的值是介于 [-1,1] ,越趋近于1代表内聚度和分离度都相对较优。
将所有点的轮廓系数求平均,就是该聚类结果总的轮廓系数。
a(i) :i向量到同一簇内其他点不相似程度的平均值
b(i) :i向量到其他簇的平均不相似程度的最小值
# Mean silhouette coefficient over all samples, computed with scikit-learn.
metrics.silhouette_score(self.data, self.labels, metric='euclidean')
算法可视化
数据降维
t-SNE 是一种非线性降维算法,非常适用于将高维数据降到 2 维或 3 维进行可视化。该算法会使高维空间中相似度较大的点在低维空间中距离较近,而相似度较低的点在低维空间中距离较远。
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)  # t-SNE dimensionality reduction, down to 2 dims
# The embedded (2-D) data.
low_dim_embs = tsne.fit_transform(np.array(list_all))
进行可视化
def visualization(self):
    """Project centers and samples to 2-D with t-SNE and scatter-plot them by cluster."""
    # Concatenate centers first, then every cluster's points, so that the
    # first k_ rows of the embedding correspond to the centers.
    list_all = self.centers_
    for i in range(self.k_):
        list_all = list_all + self.clf_[i]
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)  # t-SNE dimensionality reduction, down to 2 dims
    # The embedded (2-D) data.
    low_dim_embs = tsne.fit_transform(np.array(list_all))
    plt.title('K: {0}, Dist: {1}, Seed: {2}, Standardization: {3}'.format(self.k_, self.dist_type, self.seed, self.std))
    cnt = 0
    # Centers are drawn as large stars, one color per cluster.
    for center in range(self.k_):
        plt.scatter(low_dim_embs[cnt][0], low_dim_embs[cnt][1], c=self.color_list[center], marker='*', s=150)
        cnt += 1
    # Then one dot per sample, colored by its cluster.
    for cat in range(self.k_):
        for _ in self.clf_[cat]:
            plt.scatter(low_dim_embs[cnt][0], low_dim_embs[cnt][1], c=self.color_list[cat])
            cnt += 1
    plt.show()
代码实现
数据下载:
链接:https://pan.baidu.com/s/1iy1S5v8pk3fMW-8M_gVs0Q
提取码:1dw3
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
from sklearn.manifold import TSNE
from sklearn import metrics
class K_Means(object):
    """Plain K-Means clustering with Euclidean or Manhattan distance.

    Parameters
    ----------
    k : int
        Number of clusters.
    tolerance : float
        Convergence threshold on the total absolute movement of a center.
    max_iter : int
        Maximum number of assignment/update iterations.
    dist_type : str
        'Euclidean' or 'Manhattan'.
    seed : int
        Seed for the random choice of initial centers.
    std : bool
        If True, min-max normalize each feature column before clustering.
    type : str
        Evaluation method: 'SSE' or anything else for the silhouette score.
    """

    def __init__(self, k=2, tolerance=0.0001, max_iter=300, dist_type='Euclidean', seed=10, std=False, type='SSE'):
        self.k_ = k
        self.tolerance_ = tolerance
        self.max_iter_ = max_iter
        self.dist_type = dist_type
        self.color_list = ['r', 'b', 'yellow', 'purple', 'black', 'pink']
        self.seed = seed
        self.std = std
        self.type = type
        random.seed(self.seed)
        # Show all columns / all rows when printing DataFrames.
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        # Display length of a cell value (default 50).
        # FIX: the bare 'max_colwidth' key was removed in pandas 1.0;
        # the namespaced key works on both old and new pandas.
        pd.set_option('display.max_colwidth', 500)
        # Allow CJK characters and the minus sign in matplotlib figures.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False

    def fit(self, data):
        """Cluster `data` (2-D ndarray, rows are samples) in place.

        Sets self.centers_ (k centers), self.clf_ (list of per-cluster point
        lists), and self.labels (cluster index per sample). `data` is mutated
        when self.std is True.
        """
        if self.std:
            # Min-max normalize each feature column to [0, 1].
            # (A z-score alternative would be (col - mean) / std.)
            for col in range(data.shape[1]):
                col_min = data[:, col].min()
                col_max = data[:, col].max()
                if col_max > col_min:
                    data[:, col] = (data[:, col] - col_min) / (col_max - col_min)
                else:
                    # Constant column: the original divided by zero (NaN).
                    data[:, col] = 0
        # Initial centers: k distinct samples chosen at random.
        self.centers_ = random.sample(data.tolist(), self.k_)
        self.data = data
        self.labels = np.zeros(len(self.data))
        for _ in range(self.max_iter_):
            # Assignment step: put every sample in the cluster of its
            # nearest center under the configured distance.
            self.clf_ = [[] for _ in range(self.k_)]
            for cnt, feature in enumerate(data):
                if self.dist_type == 'Euclidean':
                    distances = [np.linalg.norm(feature - self.centers_[j]) for j in range(self.k_)]
                elif self.dist_type == 'Manhattan':
                    distances = [np.linalg.norm(feature - self.centers_[j], ord=1) for j in range(self.k_)]
                classification = distances.index(min(distances))
                self.clf_[classification].append(feature)
                self.labels[cnt] = classification
            # Update step: move each center to the mean of its cluster.
            prev_centers = self.centers_.copy()
            for i in range(self.k_):
                if self.clf_[i]:
                    self.centers_[i] = np.average(self.clf_[i], axis=0)
                # FIX: an empty cluster keeps its previous center instead of
                # np.average([]) producing a NaN center.
            # Convergence check: stop once no center moved more than tolerance.
            optimized = True
            for i in range(self.k_):
                movement = np.array(self.centers_[i]) - np.array(prev_centers[i])
                # FIX: compare the *absolute* movement; the original summed
                # signed differences, which can cancel out and stop too early.
                if np.sum(np.abs(movement)) > self.tolerance_:
                    optimized = False
            if optimized:
                break

    def visualization(self):
        """Project centers and samples to 2-D with t-SNE and scatter-plot them by cluster."""
        # Centers first, then every cluster's points, so the first k_ rows
        # of the embedding correspond to the centers.
        list_all = self.centers_
        for i in range(self.k_):
            list_all = list_all + self.clf_[i]
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)  # t-SNE down to 2 dims
        # The embedded (2-D) data.
        low_dim_embs = tsne.fit_transform(np.array(list_all))
        plt.title('K: {0}, Dist: {1}, Seed: {2}, Standardization: {3}'.format(self.k_, self.dist_type, self.seed, self.std))
        cnt = 0
        # Centers as large stars, one color per cluster.
        for center in range(self.k_):
            plt.scatter(low_dim_embs[cnt][0], low_dim_embs[cnt][1], c=self.color_list[center], marker='*', s=150)
            cnt += 1
        # One dot per sample, colored by its cluster.
        for cat in range(self.k_):
            for _ in self.clf_[cat]:
                plt.scatter(low_dim_embs[cnt][0], low_dim_embs[cnt][1], c=self.color_list[cat])
                cnt += 1
        plt.show()

    def evaluation(self):
        """Return the clustering quality: SSE if self.type == 'SSE', else the mean silhouette score."""
        if self.type == 'SSE':
            # Sum of squared distances from each point to its cluster center
            # (lower is better). 'total' avoids shadowing the builtin 'sum'.
            total = 0.0
            for i in range(self.k_):
                for point in self.clf_[i]:
                    total += np.linalg.norm(point - self.centers_[i]) ** 2
            return total
        else:
            # Silhouette coefficient in [-1, 1]; closer to 1 is better.
            return metrics.silhouette_score(self.data, self.labels, metric='euclidean')

    def predict(self, p_data):
        """Return the index of the nearest (Euclidean) center for one sample."""
        distances = [np.linalg.norm(p_data - self.centers_[center]) for center in range(self.k_)]
        index = distances.index(min(distances))
        return index
# 在神经网络中,最后一层一般是高维特征;若想查看这些高维特征的分布情况,可以用 t-SNE 将其降到二维或三维后进行可视化(init='pca' 只是用 PCA 的结果作为嵌入的初始化,t-SNE 本身是非线性降维)。这样就能看到各层特征的分布情况。
if __name__ == '__main__':
    # Load the data set; blank / 'NULL' cells become 0.
    data = pd.read_csv('test.csv', na_values='NULL')
    data = data.fillna(0)
    # Map the categorical ratings to numbers: 'Good' -> 1, 'Poor' -> 0.
    data1 = []
    for i in range(len(data)):
        tmp = data.iloc[i].tolist()
        tmp = [1 if x == 'Good' else x for x in tmp]
        tmp = [0 if x == 'Poor' else x for x in tmp]
        data1.append(tmp)
    x = np.array(data1)
    # Grid search over hyper-parameters; every run is recorded for the report.
    save_list = [[], [], [], [], []]
    best = -99999999
    best_info = ()
    res = ''  # FIX: defined up front so the final print cannot hit a NameError
    for k in [2, 3, 4, 5, 6]:
        for dist in ['Euclidean', 'Manhattan']:
            for seed in [10, 27, 43]:
                for std in [True, False]:
                    k_means = K_Means(k=k, dist_type=dist, seed=seed, std=std)
                    k_means.fit(x.copy())
                    evaluation = k_means.evaluation()
                    # Record this run.
                    save_list[0].append(k)
                    save_list[1].append(dist)
                    save_list[2].append(seed)
                    save_list[3].append(std)
                    save_list[4].append(evaluation)
                    # Normalize to a "higher is better" score: SSE is minimized,
                    # the silhouette coefficient is maximized.
                    # FIX: the original stored best = -evaluation in *both*
                    # branches, so the silhouette path compared against the
                    # wrong value and tracked the best configuration incorrectly.
                    score = -evaluation if k_means.type == 'SSE' else evaluation
                    if score > best:
                        best = score
                        best_info = (k, dist, seed, std, evaluation)
                        res = 'K: {0}, Dist: {1}, Seed: {2}, Standardization: {3}, evaluation: {4}'.format(k, dist, seed, std, evaluation)
    print('-------------------best-------------------')
    print(res)
    # Refit with the best configuration (e.g. to visualize it).
    (k, dist, seed, std, _) = best_info
    k_means = K_Means(k=k, dist_type=dist, seed=seed, std=std)
    k_means.fit(x.copy())
    # k_means.visualization()
    # Persist every run to an Excel sheet.
    pd.DataFrame({'K': save_list[0],
                  'Dist': save_list[1],
                  'Seed': save_list[2],
                  'Standardization': save_list[3],
                  'evaluation': save_list[4]}).to_excel('info.xlsx')