tensorflow 下kmeans实现



from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.contrib.factorization import KMeans

from tensorflow.examples.tutorials.mnist import input_data
#mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Load MNIST with one-hot labels; the full training-image matrix is fed as
# the clustering input.
mnist = input_data.read_data_sets("../MNIST_data", one_hot=True)
full_data_x = mnist.train.images

# Parameters
num_steps = 50  # total step numbers of training
batch_size = 1024  # the number of samples per batch
k = 20  # default number of clusters (get_k() below receives k explicitly)
num_classes = 10  # the 10 digit classes 0-9
num_features = 784  # each image is 28*28 = 784 flattened pixels


# Feed placeholders: X = flattened images, Y = one-hot digit labels.
X = tf.placeholder(dtype=tf.float32, shape=[None, num_features])
Y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes])
# Per-k results accumulated by get_k() and later written to TensorBoard.
lst_distance = []
lst_accu_train = []
lst_accu_test = []



def get_k(k, sess):
    """Fit mini-batch KMeans with `k` clusters on the MNIST training set.

    Prints every wrongly clustered training sample, then appends the final
    average distance and the train/test accuracies to the module-level
    lists lst_distance / lst_accu_train / lst_accu_test.

    Args:
        k: number of clusters to fit.
        sess: an open tf.Session, reused across calls.
    """
    print("%d clusters"%k)
    kmeans = KMeans(inputs=X, num_clusters=k, distance_metric='cosine',use_mini_batch=True)

    training_graph = kmeans.training_graph()

    # Depending on the TF version, training_graph() returns 6 or 7 tensors
    # (newer versions insert cluster_centers_var).
    if len(training_graph) > 6:
        (all_scores, cluster_idx, scores, cluster_centers_initialized,
        cluster_centers_var, init_op, train_op) = training_graph
    else:
        (all_scores, cluster_idx, scores, cluster_centers_initialized,
        init_op, train_op) = training_graph

    cluster_idx = cluster_idx[0]  # unwrap the single-element tuple
    avg_distance = tf.reduce_mean(scores)
    init_vars = tf.global_variables_initializer()

    sess.run(init_vars, feed_dict={X: full_data_x})
    sess.run(init_op, feed_dict={X: full_data_x})

    for i in range(1, num_steps+1):
        _, d, idx = sess.run([train_op, avg_distance, cluster_idx], feed_dict={X: full_data_x})
        if i%10 == 0 or i == 1:
            print("step %i, avg distance: %f"%(i, d))
    lst_distance.append(d)

    # Majority vote: per cluster, count how many samples of each digit it
    # received, then label the cluster with its most frequent digit.
    counts = np.zeros(shape=(k, num_classes))
    for i in range(len(idx)):
        counts[idx[i]] += mnist.train.labels[i]
    labels_map = [np.argmax(c) for c in counts]
    labels_map = tf.convert_to_tensor(labels_map)

    # Map each sample's cluster index to that cluster's majority label.
    cluster_label = tf.nn.embedding_lookup(labels_map, cluster_idx)
    labels_get = sess.run(cluster_label, feed_dict={X: full_data_x})

    correct_prediction = tf.equal(cluster_label, tf.cast(tf.argmax(Y, 1), tf.int32))
    accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Print every misclustered training sample.
    # BUG FIX: the original loop referenced an undefined name `result`
    # (NameError) and ran sess.run(tf.argmax(Y[i])) once per sample, which
    # adds a new graph op each iteration. Evaluate the correctness mask once
    # and take the true labels with a single numpy argmax instead.
    result = sess.run(correct_prediction,
                      feed_dict={X: full_data_x, Y: mnist.train.labels})
    true_labels = np.argmax(mnist.train.labels, axis=1)
    num = 0
    for i in range(len(result)):
        if not result[i]:
            num += 1
            print("num: %d target: %d label: %d"%(i, true_labels[i], labels_get[i]))

    test_x, test_y = mnist.test.images, mnist.test.labels
    accu_train = sess.run(accuracy_op, feed_dict={X: full_data_x, Y: mnist.train.labels})
    accu_test = sess.run(accuracy_op, feed_dict={X: test_x, Y: test_y})
    lst_accu_train.append(accu_train)
    lst_accu_test.append(accu_test)

def visual_lst(x, dir, string, begin, stride, sess):
    """Log the values of `x` as a TensorBoard scalar series.

    Each x[i] is written under tag `string` at global step begin + i*stride
    into the event-file directory `dir`, using the already-open session
    `sess` (its graph is also dumped so TensorBoard can display it).
    """
    # Build a one-entry Summary protobuf; the single value slot is mutated
    # in the loop below. The tag becomes the chart title in TensorBoard.
    summary = tf.Summary()
    summary.value.add(tag=string, simple_value=0.0)

    # Writer that records the current computation graph alongside the data.
    writer = tf.summary.FileWriter(dir, sess.graph)

    sess.run(tf.global_variables_initializer())
    for offset, value in enumerate(x):
        summary.value[0].simple_value = value
        writer.add_summary(summary, begin + offset * stride)

if __name__ == "__main__":
    stride = 2
    begin = 1
    sess = tf.Session()
    # Sweep cluster counts k = 1, 3, 5, ..., 55.
    for cluster_count in range(begin, 56, stride):
        get_k(cluster_count, sess)
    # Export each collected metric series to its own TensorBoard log dir.
    for series, tag in ((lst_distance, "distance"),
                        (lst_accu_train, "train_accuracy"),
                        (lst_accu_test, "test_accuracy")):
        visual_lst(series, "./" + tag, tag, begin, stride, sess)

本文主要是从tensorflow的角度实现kmeans,中间借鉴了TensorFlow-Examples的代码。

但是这里有几个问题没有解决:

1、怎样评判应该使用的簇数目,看一些文档上说利用均方差之类的方法,取距离变化梯度最陡的值为K,但是对于本图来说,这个怎么看?

2、单纯的看,随着K值增加,训练集和测试集的准确度都在增加,那是否应该不断增加K值呢?只是使用准确度是否合理?如果不合理,使用什么标尺比较好?

3、因为对于tensorflow不够熟悉,本来希望把错误的样本都通过tensorboard显示出来,方便进行改进,但是因为不会遍历样本集,导致该工作失败。(已想出一种简单解决方案)

如果有人了解相关答案,还请不吝赐教。

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值