tensorflow机器学习项目实战记录（聚类）

最新推荐文章于 2023-08-03 02:45:29 发布

NockinOnHeavensDoor

最新推荐文章于 2023-08-03 02:45:29 发布

阅读量1k

点赞数

分类专栏：代码碎片机器学习 tensorflow 文章标签：机器学习聚类 tensorflow

本文链接：https://blog.csdn.net/NockinOnHeavensDoor/article/details/78941141

版权

tensorflow 同时被 3 个专栏收录

64 篇文章 0 订阅

订阅专栏

机器学习

15 篇文章 0 订阅

订阅专栏

代码碎片

6 篇文章 0 订阅

订阅专栏

import sklearn.datasets
import tensorflow as tf
import time
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

数据集描述和加载

# Which synthetic dataset to cluster: 'blobs' selects Gaussian blobs, any
# other value selects two concentric circles.
DATA_TYPE = 'blobs'
# Number of samples to generate; the TensorFlow graph further down reshapes
# with this value, so the generators below must use it as well.
N = 200
# Number of clusters. This is keyed on the same 'blobs' test as the data
# generation below so that K always matches the dataset actually produced:
# four blob centres, or two rings for the circles dataset.
if DATA_TYPE == 'blobs':
    K = 4
else:
    K = 2

# Maximum number of iterations, if the conditions are not met
MAX_ITERS = 1000

start = time.time()
centers = [(-2, -2), (-2, 1.5), (1.5, -2), (2, 1.5)]
if DATA_TYPE == 'blobs':
    # Generate the blob dataset.
    # n_features is the number of columns, centers are the cluster centres,
    # and center_box bounds the centres when they are generated at random.
    data, features = sklearn.datasets.make_blobs(
        n_samples=N, n_features=2, centers=centers,
        cluster_std=0.8, center_box=(-10.0, 10.0), shuffle=False,
        random_state=42)
    print("data", data)
else:
    # Generate the concentric-circles dataset.
    # noise is the random noise added to the circles, factor is the scale
    # factor between the inner and the outer circle.
    data, features = sklearn.datasets.make_circles(
        n_samples=N, shuffle=True, noise=0.01,
        random_state=None, factor=0.4)
"""得到图和轴画中心点"""
# Split the (x, y) centre tuples into one row of x values and one row of y
# values, which is the form scatter() expects.
centre_x, centre_y = np.asarray(centers).T

# First figure: the cluster centres on their own.
fig, ax = plt.subplots()
ax.scatter(centre_x, centre_y, marker='o', s=250)
plt.show()

# Second figure: the centres again, plus every generated sample coloured by
# the label returned from the dataset generator.
sample_x, sample_y = data.T
fig, ax = plt.subplots()
ax.scatter(centre_x, centre_y, marker='o', s=250)
ax.scatter(sample_x, sample_y, marker='o', s=100, c=features,
           cmap=plt.cm.coolwarm)
plt.show()

data [[-1.60262868 -2.11061144]
 [-1.48184917 -0.78157611]
 [-2.1873227  -2.18730957]
 .................
 [ 1.28126826  1.89353534]
 [ 0.94381343  2.96516701]
 [ 2.9435521   1.12465948]
 [ 0.62949238  2.5830979 ]
 [ 1.90836812  2.49025305]]

这里写图片描述

损失函数描述和优化循环

sess = tf.Session()
# All samples as a TensorFlow variable, one row per sample with two columns.
points = tf.Variable(data)
# Cluster label currently assigned to each of the N points; all start at 0.
cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))
# Seed the K centroids with the first K samples. `points` is itself a
# variable, so its initialized value is used to build this initial value.
# tf.slice with begin [0, 0] and size [K, 2] takes the first K rows and both
# columns of each row.
centroids = tf.Variable(tf.slice(points.initialized_value(), [0, 0], [K, 2]))
# tf.initialize_all_variables() is deprecated; TensorFlow's own warning asks
# for tf.global_variables_initializer() instead.
init = tf.global_variables_initializer()
sess.run(init)

print(sess.run(centroids))

WARNING:tensorflow:From C:\Users\Xiaowang Zhang\Anaconda3\lib\site-packages\tensorflow\python\util\tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
[[-1.60262868 -2.11061144]
 [-1.48184917 -0.78157611]
 [-2.1873227  -2.18730957]
 [-0.73662975 -1.38605222]]

# NOTE(review): this rebinds `sess` to a brand-new session; a session created
# earlier under the same name is not closed here.
sess = tf.Session()
# tf.initialize_all_variables() is deprecated in favour of
# tf.global_variables_initializer().
sess.run(tf.global_variables_initializer())
# Evaluate the centroids once in the new session; the result is discarded.
sess.run(centroids)

"""损失函数 和 优化循环"""
"""在0轴上复制centroids N次，N=200，即有了200个一样的中心点（这里说的是4个不同的中心点）堆在一起。 """
# Tile the (K, 2) centroids N times along axis 0 and reshape to (N, K, 2):
# every one of the N slots holds the same K centroids.
rep_centroids = tf.reshape(tf.tile(centroids, [N, 1]), [N, K, 2])
# Tiling the points K times along axis 1 gives shape (N, 2 * K), i.e. each
# row repeats the same (x, y) pair K times; after the reshape to (N, K, 2)
# each slot holds K copies of one point.
rep_points = tf.reshape(tf.tile(points, [1, K]), [N, K, 2])
# Subtract every centroid from every point, square, and sum over the
# coordinate axis (axis 2). No square root is taken, so this is the squared
# Euclidean distance. `axis` replaces the deprecated `reduction_indices`.
sum_squares = tf.reduce_sum(tf.square(rep_points - rep_centroids), axis=2)
# Index of the smallest distance along axis 1, i.e. the nearest centroid of
# each point.
best_centroids = tf.argmin(sum_squares, 1)
# tf.initialize_all_variables() is deprecated in favour of
# tf.global_variables_initializer().
init = tf.global_variables_initializer()
sess.run(init)
#print("sum_squares:",sess.run(sum_squares))
print("best_centroids:",sess.run(best_centroids))
# tf.not_equal yields a boolean tensor and reduce_any is True as soon as any
# element is True: the flag says whether at least one point would switch
# cluster. cluster_assignments starts out as all zeros, so the first
# comparison is against label 0 for every point.
did_assignments_change = tf.reduce_any(tf.not_equal(best_centroids, cluster_assignments))
def bucket_mean(data, bucket_ids, num_buckets):
    """Return the mean of the rows of ``data`` that share a bucket id.

    ``bucket_ids`` gives, for each row of ``data``, the bucket (here: the
    label of the nearest centroid) the row belongs to; ``num_buckets`` is the
    number of buckets in the result.
    """
    # Summing a tensor of ones with the same segmentation counts how many
    # rows landed in each bucket.
    bucket_sizes = tf.unsorted_segment_sum(
        tf.ones_like(data), bucket_ids, num_buckets)
    # Sum of the rows of each bucket.
    bucket_sums = tf.unsorted_segment_sum(data, bucket_ids, num_buckets)
    # NOTE(review): a bucket that received no rows presumably has size 0 and
    # would make this a division by zero - confirm against the caller.
    return bucket_sums / bucket_sizes
"""means中含有每一类标签的均值"""
# Mean position of the points currently nearest to each of the K centroids.
means = bucket_mean(points, best_centroids, K)
# Create the update ops under a control dependency on
# did_assignments_change, so that flag is evaluated before either variable
# is overwritten.
with tf.control_dependencies([did_assignments_change]):
    update_centroids = centroids.assign(means)
    update_assignments = cluster_assignments.assign(best_centroids)
    do_updates = tf.group(update_centroids, update_assignments)

changed = True
iters = 0
# Colour index of each centroid marker. Slicing by K keeps the list the same
# length as the number of centroids ([2, 1, 4, 3] for K=4, [2, 1] for K=2);
# scatter() needs one colour value per marker.
colourindexes = [2, 1, 4, 3][:K]
# `points` is never assigned to by the update ops, so its coordinates are
# fetched once instead of twice per iteration.
point_x, point_y = sess.run(points).transpose()
# The loop body always runs at least once (changed starts True and iters
# starts below MAX_ITERS), so `ax`, `centers` and `assignments` are defined
# afterwards. No figure is created before the loop: it would stay empty.
while changed and iters < MAX_ITERS:
    # One figure per iteration; they are left open so that plt.show() below
    # displays all of them.
    fig, ax = plt.subplots()
    iters += 1
    # One k-means step: read the "did anything change" flag and apply the
    # centroid / assignment updates in the same run call.
    [changed, _] = sess.run([did_assignments_change, do_updates])
    [centers, assignments] = sess.run([centroids, cluster_assignments])
    ax.scatter(point_x, point_y,
               marker='o', s=200, c=assignments, cmap=plt.cm.coolwarm)
    ax.scatter(centers[:, 0], centers[:, 1], marker='^', s=550,
               c=colourindexes, cmap=plt.cm.plasma)
    ax.set_title('Iteration ' + str(iters))
    plt.savefig('kmeans' + str(iters) + '.png')
# Draw the points once more on the axes of the last iteration, then display.
ax.scatter(point_x, point_y,
           marker='o', s=200, c=assignments, cmap=plt.cm.coolwarm)
plt.show()
end = time.time()
print("Found in %.2f seconds" % (end - start),iters, "iterations" )
print("Centroids:")
print(centers)
print("Cluster assignments:", assignments)

best_centroids: [0 1 2 3 2 2 2 2 2 2 0 2 2 2 2 1 2 0 2 2 0 2 2 1 0 0 2 3 2 1 2 2 1 1 0 1 1
 2 2 2 2 0 2 0 2 1 2 2 0 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 1 3 3 3 1 1 3 1 3]

这里写图片描述
… . .

Found in 7.20 seconds 8 iterations
Centroids:
[[ 1.65289262 -2.04643427]
 [-2.0763623   1.61204964]
 [-2.08862822 -2.07255306]
 [ 2.09831502  1.55936014]]
Cluster assignments: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 3 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]

使用tf完成最近邻算法

Knn的步骤（半监督方式）：
* 1)设定训练集的数据类别信息。
* 2)读取下一个要分类的样本,并计算从新样本到训练集中每个样本的L2距离。
* 3)欧几里得距离最近的K个样本投票决定新样本的类别信息（注意：下面的代码只取距离最近的1个样本，即相当于K=1的最近邻）。
* 4)重复直到新样本被完全确定。

import tensorflow as tf
import numpy as np
import time
%matplotlib inline
import matplotlib.pyplot as plt
from  sklearn.datasets.samples_generator import make_circles

# Total number of generated samples.
N = 210
# NOTE(review): K and MAX_ITERS are not read by the nearest-neighbour code
# visible in this section - confirm whether they are still needed.
K = 2
MAX_ITERS = 1000
# Index that splits the samples: the first 70% are training data, the rest
# are test data.
cut = int(N * 0.7)

start = time.time()
# factor is the scale factor between the two circles; noise is the random
# noise added to the samples.
data, features = make_circles(n_samples=N, shuffle=True, noise= 0.12, factor= 0.4)
tr_data , tr_features = data[:cut], features[:cut]
te_data, te_features = data[cut:], features[cut:]
#print(data)
print('-------------')
#print(features)
fig, ax = plt.subplots()
# Each row of tr_data is one sample; transposing yields one row of x values
# and one row of y values, which is what scatter() takes.
ax.scatter(tr_data.transpose()[0],tr_data.transpose()[1], marker='p',s=100,c=tr_features,cmap=plt.cm.coolwarm)
# plt.plot() without arguments draws nothing; plt.show() displays the figure.
plt.show()

# NOTE(review): `points` and `cluster_assignments` are not read by the
# nearest-neighbour code below; they are kept in case other cells use them.
points = tf.Variable(data)
cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))

# Build the distance graph ONCE and feed each test sample through a
# placeholder. Creating the ops inside the loop would add new nodes to the
# graph for every test sample.
query = tf.placeholder(tf.as_dtype(tr_data.dtype), shape=[tr_data.shape[1]])
# Squared distance from the query sample to every training sample.
distances = tf.reduce_sum(tf.square(tf.subtract(query, tr_data)), axis=1)
# Index of the single closest training sample; that one sample alone decides
# the predicted class (1-nearest-neighbour).
neighbor = tf.argmin(distances, axis=0)

sess = tf.Session()
# tf.initialize_all_variables() is deprecated in favour of
# tf.global_variables_initializer().
sess.run(tf.global_variables_initializer())

# Predicted label of every test sample, in test-set order.
test = [
    tr_features[sess.run(neighbor, feed_dict={query: sample})]
    for sample in te_data
]

print("Cluster assignments: ",test)

fig, ax = plt.subplots()
ax.scatter(te_data.transpose()[0], te_data.transpose()[1], marker='o',s =100,c=test,cmap=plt.cm.coolwarm)
# plt.plot() without arguments draws nothing; plt.show() displays the figure.
plt.show()

end = time.time()
print("Found in %.f seconds" % (end-start))
# Number of test samples whose predicted label differs from the true label.
print("numbers of false Cluster: ", np.sum(np.asarray(test) != te_features))

-------------
Cluster assignments:  [0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0]
Found in 7 seconds
numbers of false Cluster:  1

这里写图片描述