I've recently been playing with simple MLPs on MNIST again, and noticed that after raising the hidden layer width from 300 to 1000 units, the training loss became NaN for the entire run. A web search turned up the cause: the cross-entropy loss uses a log function, and when the log's input is exactly 0 the loss turns into NaN. A direct fix is to add a very small constant (e.g. 1e-10) to the input of the log. After this change, however, the test loss was no longer NaN but oscillated throughout training, which turned out to be caused by an overly large learning rate; lowering the learning rate by an order of magnitude let the network converge normally.
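The failure mode is easy to reproduce in isolation. Below is a minimal sketch (TensorFlow 1.x, same API as the test code that follows): with one-hot labels, tf.log(0) gives -inf, the label entry of 0 multiplies it into NaN, and the reduce_sum propagates the NaN into the whole loss.

import tensorflow as tf

sess = tf.InteractiveSession()
y  = tf.constant([[0.0, 1.0]])  # predicted probabilities; one entry is exactly 0
y_ = tf.constant([[0.0, 1.0]])  # one-hot labels
# tf.log(0) = -inf, and the label entry of 0 times -inf gives NaN,
# which reduce_sum then propagates:
print(sess.run(-tf.reduce_sum(y_ * tf.log(y), axis=1)))          # [nan]
# Adding a tiny epsilon keeps every log finite:
print(sess.run(-tf.reduce_sum(y_ * tf.log(y + 1e-10), axis=1)))  # finite, ~0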
Test code:
# -*- coding: utf-8 -*-
import time
start = time.clock()  # note: time.clock() was removed in Python 3.8; use time.time() on newer Pythons
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_DATA/", one_hot = True)
import tensorflow as tf
tf.reset_default_graph()
import random as rd
sess = tf.InteractiveSession()
log_dir = 'C:/TF_LOGS/myMLP4'
in_units = 784
learning_rate = 0.001
dataset_size = 5000
batch_size = 100
[h1_units, h2_units, h3_units, h4_units] = [1000, 1000, 1000, 1000]
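# Five weight matrices: four hidden layers plus the softmax output layer.
# Weights use truncated-normal init (stddev 0.1); biases start at zero.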
W1 = tf.Variable(tf.truncated_normal(shape = [in_units, h1_units], mean = 0, stddev = 0.1))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(0.001)(W1))
b1 = tf.Variable(tf.zeros(shape = [h1_units]))
W2 = tf.Variable(tf.truncated_normal(shape = [h1_units, h2_units], mean = 0, stddev = 0.1))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(0.001)(W2))
b2 = tf.Variable(tf.zeros(shape = [h2_units]))
W3 = tf.Variable(tf.truncated_normal(shape = [h2_units, h3_units], mean = 0, stddev = 0.1))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(0.001)(W3))
b3 = tf.Variable(tf.zeros(shape = [h3_units]))
W4 = tf.Variable(tf.truncated_normal(shape = [h3_units, h4_units], mean = 0, stddev = 0.1))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(0.001)(W4))
b4 = tf.Variable(tf.zeros(shape = [h4_units]))
W5 = tf.Variable(tf.truncated_normal(shape = [h4_units, 10], mean = 0, stddev = 0.1))
#tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(0.001)(W5))
b5 = tf.Variable(tf.zeros(shape = [10]))
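# Input placeholders and forward pass: four ReLU hidden layers, dropout on
# the last hidden layer, then a softmax over the 10 digit classes.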
x = tf.placeholder(dtype = tf.float32, shape = [None, in_units])
keep_prob = tf.placeholder(dtype = tf.float32)
h1 = tf.nn.relu(tf.matmul(x,W1)+b1)
h2 = tf.nn.relu(tf.matmul(h1,W2)+b2)
h3 = tf.nn.relu(tf.matmul(h2,W3)+b3)
h4 = tf.nn.relu(tf.matmul(h3,W4)+b4)
h4_drop = tf.nn.dropout(h4, keep_prob)
y = tf.nn.softmax(tf.matmul(h4_drop, W5)+b5)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y + 1e-10), reduction_indices=[1]))  # epsilon keeps the log's input away from 0
#cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))  # original version: NaN once y contains exact zeros
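# With the l2_regularizer lines above commented out, total_loss below is
# just the cross-entropy term.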
tf.add_to_collection('losses', cross_entropy)
total_loss = tf.add_n(tf.get_collection('losses'))
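# The same cross_entropy tensor is logged under two different tags, so the
# train and test curves show up as separate series in TensorBoard.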
cross_entropy_train = tf.summary.scalar('cross_entropy_train', cross_entropy)
cross_entropy_test = tf.summary.scalar('cross_entropy_test', cross_entropy)
train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)
if_correct = tf.equal(tf.argmax(y,1), tf.argmax(y_, 1))
acc = tf.reduce_mean(tf.cast(if_correct, tf.float32))
acc_test = tf.summary.scalar('acc_test', acc)
train_writer = tf.summary.FileWriter(log_dir, sess.graph)
saver = tf.train.Saver()
xs, ys = mnist.train.next_batch(dataset_size)
tf.global_variables_initializer().run()
# Train for 50,000 steps on random mini-batches drawn from the fixed
# 5000-example subset; log the test metrics every 100 steps.
for i in range(50000):
    index = rd.sample(range(0, dataset_size), batch_size)
    batch_xs = xs[index]
    batch_ys = ys[index]
    _, sum_cross_entropy_train = sess.run([train_step, cross_entropy_train], feed_dict={x: batch_xs, y_: batch_ys, keep_prob: .5})
    train_writer.add_summary(sum_cross_entropy_train, i)
    if i%100 == 1:
        sum_acc_test, sum_cross_entropy_test = sess.run([acc_test, cross_entropy_test], feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
        train_writer.add_summary(sum_acc_test, i)
        train_writer.add_summary(sum_cross_entropy_test, i)
#        saver.save(sess, log_dir+'/model.ckpt', i)
train_writer.close()
print(acc.eval({x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
end = time.clock()
print('Running time: %s Seconds'%(end-start))
#
#$ tensorboard --logdir=C:/TF_LOGS/myMLP4
#Open in a browser: http://DESKTOP-GJKI5V4:6006
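As an aside, the epsilon trick can be avoided entirely by letting TensorFlow compute the softmax and the log together. Below is a minimal sketch of how the loss section of the script above could be rewritten (TF 1.x API; tensor names match the script):

logits = tf.matmul(h4_drop, W5) + b5   # raw scores; no softmax applied here
y = tf.nn.softmax(logits)              # still used for the accuracy computation
# softmax_cross_entropy_with_logits applies a numerically stable log-softmax
# internally, so no epsilon is needed and the log never sees an exact 0:
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))

The learning-rate observation is independent of this: even with a numerically stable loss, too large a step size can still make the test loss oscillate instead of converge.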