from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.contrib.factorization import KMeans
from tensorflow.examples.tutorials.mnist import input_data
# Load MNIST with one-hot encoded labels. An earlier revision read the data
# from "/tmp/data/" instead of the relative directory used here.
mnist = input_data.read_data_sets("../MNIST_data", one_hot=True)
full_data_x = mnist.train.images  # training images, fed to the graph as X

# Parameters
num_steps = 50  # number of training steps run for each cluster count
batch_size = 1024  # number of samples per batch
k = 20  # number of clusters
num_classes = 10  # one class per digit, 0 through 9
num_features = 784  # every image is 28*28 pixels, flattened

# Graph inputs: flattened images and their one-hot labels.
X = tf.placeholder(tf.float32, [None, num_features])
Y = tf.placeholder(tf.float32, [None, num_classes])

# Per-run results appended by get_k(): final average distance, training
# accuracy and test accuracy.
lst_distance, lst_accu_train, lst_accu_test = [], [], []
def get_k(k, sess):
    """Train a KMeans model with ``k`` clusters and record its metrics.

    The model is trained on ``full_data_x`` for ``num_steps`` steps. Every
    cluster is then mapped to the digit that occurs most often among the
    training samples assigned to it, each misclassified training sample is
    printed, and the results are appended to the module-level lists
    ``lst_distance`` (average distance after the last step),
    ``lst_accu_train`` and ``lst_accu_test``.

    Args:
        k: Number of clusters to build.
        sess: ``tf.Session`` used to run every op created here.

    Returns:
        None. Results are reported through prints and the module-level lists.
    """
    print("%d clusters"%k)
    kmeans = KMeans(inputs=X, num_clusters=k, distance_metric='cosine',
                    use_mini_batch=True)
    training_graph = kmeans.training_graph()
    # training_graph() yields either 6 or 7 values (the longer form carries
    # an extra cluster-centers variable before init_op). In both layouts the
    # assignments and scores are items 1 and 2, and init_op / train_op are
    # the last two items, so indexing covers both without branching.
    cluster_idx = training_graph[1][0]  # unwrap: cluster_idx is a tuple
    scores = training_graph[2]
    init_op, train_op = training_graph[-2:]
    avg_distance = tf.reduce_mean(scores)

    init_vars = tf.global_variables_initializer()
    train_feed = {X: full_data_x}
    sess.run(init_vars, feed_dict=train_feed)
    sess.run(init_op, feed_dict=train_feed)

    for step in range(1, num_steps + 1):
        _, d, idx = sess.run([train_op, avg_distance, cluster_idx],
                             feed_dict=train_feed)
        if step % 10 == 0 or step == 1:
            print("step %i, avg distance: %f"%(step, d))
    # One entry per call: the average distance after the final step.
    lst_distance.append(d)

    # counts[c, j] = number of training samples of digit j assigned to
    # cluster c. np.add.at accumulates correctly for repeated cluster ids.
    counts = np.zeros(shape=(k, num_classes))
    np.add.at(counts, idx, mnist.train.labels)
    # Majority vote: each cluster is labelled with its most frequent digit.
    # NOTE(review): deliberately kept as a Python list so the dtype TensorFlow
    # infers is the same as before; it must match the int32 cast used in
    # correct_prediction below.
    labels_map = [np.argmax(c) for c in counts]
    labels_map = tf.convert_to_tensor(labels_map)
    cluster_label = tf.nn.embedding_lookup(labels_map, cluster_idx)
    labels_get = sess.run(cluster_label, feed_dict=train_feed)

    # Print every misclassified training sample. The true digits come
    # straight from the one-hot labels with NumPy, so no graph op is created
    # and no session call is made per sample.
    true_labels = np.argmax(mnist.train.labels, axis=1)
    result = labels_get == true_labels
    for i in np.flatnonzero(~result):
        print("num: %d target: %d label: %d"%(i, true_labels[i], labels_get[i]))

    correct_prediction = tf.equal(cluster_label,
                                  tf.cast(tf.argmax(Y, 1), tf.int32))
    accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    test_x, test_y = mnist.test.images, mnist.test.labels
    accu_train = sess.run(accuracy_op,
                          feed_dict={X: full_data_x, Y: mnist.train.labels})
    accu_test = sess.run(accuracy_op, feed_dict={X: test_x, Y: test_y})
    lst_accu_train.append(accu_train)
    lst_accu_test.append(accu_test)
def visual_lst(x, dir, string, begin, stride, sess):
    """Write the values in ``x`` to an event file as one scalar series.

    Args:
        x: Sequence of scalar values to log.
        dir: Directory the event file is written to.
        string: Tag of the scalar, i.e. the name of the resulting chart.
        begin: Step assigned to the first value.
        stride: Step increment between consecutive values.
        sess: ``tf.Session`` whose graph is stored alongside the scalars.

    Returns:
        None.
    """
    # One reusable Summary message holding a single scalar slot; the slot is
    # overwritten with each value before it is handed to the writer.
    loss_summary = tf.Summary()
    loss_summary.value.add(tag=string, simple_value=0.0)
    summary_writer1 = tf.summary.FileWriter(dir, sess.graph)
    # No variable initializer is run here: writing summaries does not read
    # any variable, and re-initializing would only reset trained state and
    # add another op to the graph on every call.
    try:
        for i, value in enumerate(x):
            loss_summary.value[0].simple_value = value
            summary_writer1.add_summary(loss_summary, begin + i * stride)
    finally:
        # Closing flushes pending events; without it they may never reach
        # the disk.
        summary_writer1.close()
if __name__ == "__main__":
    # Sweep the cluster count over begin, begin + stride, ... (below 56),
    # then write one chart per collected metric.
    stride = 2
    begin = 1
    # The context manager closes the session (and frees its resources) even
    # if one of the runs fails.
    with tf.Session() as sess:
        for i in range(begin, 56, stride):
            get_k(i, sess)
        # Entry i of every list belongs to cluster count begin + i * stride,
        # which visual_lst uses as the step of the plotted point.
        visual_lst(lst_distance, "./distance", "distance", begin, stride, sess)
        visual_lst(lst_accu_train, "./train_accuracy", "train_accuracy", begin, stride, sess)
        visual_lst(lst_accu_test, "./test_accuracy", "test_accuracy", begin, stride, sess)
本文主要是从tensorflow的角度实现kmeans,中间借鉴了TensorFlow-Examples的代码。
但是这里有几个问题没有解决:
1、怎样评判应该使用的簇数目,看一些文档上说利用像均方差之类的方法,取距离变化梯度最陡的值为K,但是对于本图来说,这个怎么看?
2、单纯地看,随着K值增加,训练集和测试集的准确度都在增加,那是否应该不断增加K值呢?只是使用准确度是否合理?如果不合理,使用什么标尺比较好?
3、因为对于tensorflow不够熟悉,本来希望把错误的样本都通过tensorboard显示出来,方便进行改进,但是因为不会遍历样本集,导致该工作失败。(已想出一种简单解决方案)
如果有人了解相关答案,还请不吝赐教。