以下是K-Means算法的具体TensorFlow代码:
#作者:宝蓓
#名称:K-means算法
#代码思路:基本K-Means算法:1、首先确定常数K,常数K意味着最终的聚类类别数;
# 2、随机选定初始点为质心,并通过计算每一个样本与质心之间的相似度(这里为欧式距离),将样本点归到最相似的类中;
# 3、接着,重新计算每个类的质心(即为类中心)
# 4、重复这样的过程,直到质心不再改变,最终就确定了每个样本所属的类别以及每个类的质心。
#选用的数据集:iris数据集,数据集已明确标注为3类
#1、首先导入必要的编程库。
import itertools

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn import datasets
from scipy.spatial import cKDTree
#PCA主要是用来数据降维,将高维度的特征映射到低维度的特征,加快机器学习的速度。
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
# 2. Create the graph session and load the iris dataset.
sess = tf.Session()
iris = datasets.load_iris()
# num_pts: total number of samples in the dataset.
num_pts = len(iris.data)
# num_feats: number of features carried by each sample.
num_feats = len(iris.data[0])
# 3. Number of clusters to fit.
k = 3
# Number of update iterations to run.
generations = 25
# Graph variables: the sample matrix and the cluster id assigned to each
# sample (all zeros until the first update runs).
data_points = tf.Variable(iris.data)
cluster_labels = tf.Variable(tf.zeros([num_pts], dtype=tf.int64))
# 4. Initialise the centroids from k randomly chosen samples.
# The indices are drawn WITHOUT replacement: drawing the same row twice would
# create two identical centroids that tie on every sample, which typically
# leaves one cluster empty and turns its mean into 0/0 = NaN.
rand_starts = iris.data[np.random.choice(num_pts, k, replace=False)]
centroids = tf.Variable(rand_starts)
# 5. Squared Euclidean distance from every sample to every centroid.
# Both operands are expanded to the common shape [num_pts, k, num_feats] so
# they can be subtracted element-wise.
# centroid_matrix[i, j, :] is centroid j, repeated for every sample i.
centroid_matrix = tf.tile(tf.expand_dims(centroids, 0), [num_pts, 1, 1])
# point_matrix[i, j, :] is sample i, repeated once per centroid j.
point_matrix = tf.tile(tf.expand_dims(data_points, 1), [1, k, 1])
# Summing the squared differences over the feature axis collapses the result
# to [num_pts, k]. No square root is taken; it would not change the argmin.
distances = tf.reduce_sum(
    tf.square(centroid_matrix - point_matrix),
    axis=2,
)
# 6. Assign every sample to the centroid with the smallest distance.
# argmin over axis 1 yields one centroid index per sample.
centroid_group = tf.argmin(distances, 1)
# 7. Recompute each centroid as the mean of the samples assigned to it.
def data_group_avg(group_ids, data, num_groups=None):
    """Return the per-group mean of the rows of ``data``.

    Args:
        group_ids: integer tensor holding the group id of each row of
            ``data``.
        data: tensor whose rows are summed per group.
        num_groups: number of groups to produce. When omitted, the
            module-level cluster count ``k`` is used, so the result always
            has one row per centroid instead of a hard-coded three.

    Returns:
        A tensor with one row per group holding the element-wise mean of the
        rows assigned to that group. A group with no rows yields zeros
        rather than NaN.
    """
    if num_groups is None:
        num_groups = k
    sum_total = tf.unsorted_segment_sum(data, group_ids, num_groups)
    # Summing a tensor of ones with the same segmentation counts the rows
    # that fall in each group (the count is repeated across every column).
    num_total = tf.unsorted_segment_sum(
        tf.ones_like(data), group_ids, num_groups)
    # Floor the count at 1 so an empty group divides 0 by 1 instead of 0 by 0;
    # a NaN centroid would poison every later distance computation.
    safe_total = tf.maximum(num_total, tf.ones_like(num_total))
    avg_by_group = sum_total / safe_total
    return avg_by_group
# One k-means step: compute the new group means, then write them into the
# centroids and store the current assignment of every sample.
means = data_group_avg(centroid_group, data_points)
update = tf.group(
    centroids.assign(means),
    cluster_labels.assign(centroid_group),
)
# 8. Initialise all graph variables.
init = tf.global_variables_initializer()
sess.run(init)
# 9. Run the update step for a fixed number of generations and report how
# many samples landed in each cluster after every step.
for i in range(generations):
    print('Calculating gen {},out of {}.'.format(i, generations))
    _, centroid_group_count = sess.run([update, centroid_group])
    group_count = [np.sum(centroid_group_count == ix) for ix in range(k)]
    print('Group counts:{}'.format(group_count))
# Fetch the final centroids and per-sample cluster ids for evaluation.
[centers, assignments] = sess.run([centroids, cluster_labels])
def most_common(my_list):
    """Return the element that occurs most often in ``my_list``.

    Only the distinct values are scanned, each scored by how many times it
    appears in the list. When several values share the top count, the one
    that set iteration yields first wins. An empty list raises ValueError.
    """
    distinct_values = set(my_list)
    return max(distinct_values, key=lambda value: my_list.count(value))
# Score the clustering against the true species labels.
# Cluster ids are arbitrary, so each species must first be mapped to a
# cluster id. A per-species majority vote can map two species to the SAME
# cluster and then count that cluster's members as correct for both,
# overstating the accuracy. The mapping is therefore chosen jointly so that
# every species gets its own cluster whenever k allows it.
num_classes = 3
# class_cluster_counts[c, j]: samples of species c assigned to cluster j.
class_cluster_counts = np.array([
    [np.sum(assignments[iris.target == c] == j) for j in range(k)]
    for c in range(num_classes)
])
if k >= num_classes:
    # Every way of giving each species a distinct cluster id.
    candidate_maps = itertools.permutations(range(k), num_classes)
else:
    # Fewer clusters than species: a one-to-one mapping cannot exist, so
    # fall back to an independent majority vote per species.
    candidate_maps = itertools.product(range(k), repeat=num_classes)
label0, label1, label2 = max(
    candidate_maps,
    key=lambda labels: sum(
        class_cluster_counts[c, labels[c]] for c in range(num_classes)),
)
group0_count = class_cluster_counts[0, label0]
group1_count = class_cluster_counts[1, label1]
group2_count = class_cluster_counts[2, label2]
# Divide by the real sample count instead of a hard-coded 150.
accuracy = (group0_count + group1_count + group2_count) / float(num_pts)
print('Accuracy:{:.2}'.format(accuracy))
# Visualise the clustering result in two dimensions.
# PCA(n_components=2) projects the 4-feature samples onto their first two
# principal components so they can be drawn on a plane.
pca_model = PCA(n_components=2)
# Fit the projection on the samples and project them.
reduced_data = pca_model.fit_transform(iris.data)
# Project the learned centroids with the same fitted model.
reduced_centers = pca_model.transform(centers)
# h: grid spacing used to paint the background regions.
h = .02
# Axis limits: the projected data range padded by 1 on every side.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Flatten the grid into an (n_points, 2) coordinate array directly in NumPy
# instead of round-tripping every coordinate through Python lists.
xx_pt = xx.ravel()
yy_pt = yy.ravel()
xy_pts = np.column_stack((xx_pt, yy_pt))
# Label every grid point with the index of its nearest projected centroid.
# NOTE: this is the nearest centroid in the 2-D PCA plane, which is what the
# background shows; it is not recomputed in the original 4-D feature space.
mytree = cKDTree(reduced_centers)
dist, indexes = mytree.query(xy_pts)
indexes = indexes.reshape(xx.shape)
# Draw the figure with matplotlib.
plt.clf()
# Background: colour every grid cell by its nearest projected centroid.
plt.imshow(indexes, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()), cmap=plt.cm.Paired, aspect='auto', origin='lower')
symbols = ['o', '^', 'D']
label_name = ['Setosa', 'Versicolour', 'Virginica']
# The samples are plotted in three consecutive slices of 50 rows, one marker
# style and legend entry per slice.
for i, (symbol, species) in enumerate(zip(symbols, label_name)):
    temp_group = reduced_data[(i * 50):(50) * (i + 1)]
    plt.plot(temp_group[:, 0], temp_group[:, 1], symbol, markersize=10, label=species)
# Mark every projected centroid with a white cross drawn above the samples.
plt.scatter(reduced_centers[:, 0], reduced_centers[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)
# Title typo fixed: "wthite" -> "white".
plt.title('K-means clustering on Iris Dataset\n' 'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='lower right')
plt.show()
最后输出的结果有:
Calculating gen 0,out of 25.
Group counts:[9, 72, 69]
Calculating gen 1,out of 25.
Group counts:[36, 63, 51]
Calculating gen 2,out of 25.
Group counts:[41, 59, 50]
Calculating gen 3,out of 25.
Group counts:[46, 54, 50]
Calculating gen 4,out of 25.
Group counts:[50, 50, 50]
Calculating gen 5,out of 25.
Group counts:[54, 46, 50]
Calculating gen 6,out of 25.
Group counts:[57, 43, 50]
Calculating gen 7,out of 25.
Group counts:[60, 40, 50]
Calculating gen 8,out of 25.
Group counts:[61, 39, 50]
Calculating gen 9,out of 25.
Group counts:[61, 39, 50]
Calculating gen 10,out of 25.
Group counts:[61, 39, 50]
Calculating gen 11,out of 25.
Group counts:[61, 39, 50]
Calculating gen 12,out of 25.
Group counts:[61, 39, 50]
Calculating gen 13,out of 25.
Group counts:[61, 39, 50]
Calculating gen 14,out of 25.
Group counts:[61, 39, 50]
Calculating gen 15,out of 25.
Group counts:[61, 39, 50]
Calculating gen 16,out of 25.
Group counts:[61, 39, 50]
Calculating gen 17,out of 25.
Group counts:[61, 39, 50]
Calculating gen 18,out of 25.
Group counts:[61, 39, 50]
Calculating gen 19,out of 25.
Group counts:[61, 39, 50]
Calculating gen 20,out of 25.
Group counts:[61, 39, 50]
Calculating gen 21,out of 25.
Group counts:[61, 39, 50]
Calculating gen 22,out of 25.
Group counts:[61, 39, 50]
Calculating gen 23,out of 25.
Group counts:[61, 39, 50]
Calculating gen 24,out of 25.
Group counts:[61, 39, 50]
Accuracy:0.89