# Kevin Xu-TensorFlow Tutorials-cifar10 (2)

Kevin知乎：点击打开链接

1. 这部分代码的使用方法：如果你不关心代码是怎么写的，可以按下面的步骤直接运行；完整代码附在文末。

# 0. you need to change the data directory（目录），文中所有涉及地址的地方你都要改成自己的
# 1. run cifar10-MAIN.py
# 2. call train() in the console to train the model
# 3. call evaluate() in the console to test on the test data

-----------------------------------------------代码分解---------------------------------------------------

1.导入包

import os
import os.path
import math
import numpy as np
import tensorflow as tf
import input_data # 之前写的数据处理

2.设置一些超参数

# Hyperparameters: batch size, learning rate and maximum number of training steps.
BATCH_SIZE = 128
learning_rate = 0.05
MAX_STEP = 100000
# Original author's note: with this setting, it took less than 30 mins on my laptop to train.
# A larger MAX_STEP trains for longer; start with a smaller value such as 10000.
# 100k steps matches the official TensorFlow tutorial and may take a long time.

3.网络架构

def inference(images):
    """Build the CIFAR-10 classification network and return its logits.

    The network is: conv1 -> max-pool + LRN -> conv2 -> LRN + max-pool ->
    two fully connected ReLU layers -> a final linear layer with 10 outputs.

    Args:
        images: 4D float tensor laid out as
            [batch_size, height, width, channels] (the default NHWC layout
            of tf.nn.conv2d). The first dimension must equal the
            module-level BATCH_SIZE, because the flatten step reshapes to
            [BATCH_SIZE, -1].

    Returns:
        2D tensor of shape [BATCH_SIZE, 10] holding unnormalised class
        scores (logits). No softmax is applied here.

    Notes:
        A conv kernel has shape
        [kernel_height, kernel_width, input_channels, output_channels].
        The input channel count comes from the previous layer; for the
        first layer it is the number of image channels.

        NOTE(review): the bias-add lines and the tails of both max_pool
        calls were missing from the published listing. They are restored
        here with padding='SAME' -- confirm against the author's code.
        Shape comments below assume 32x32x3 inputs, as the original
        listing states.
    """
    # conv1: 3x3 kernels, 3 input channels, 96 output channels.
    with tf.variable_scope('conv1') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 3, 96],
            dtype=tf.float32,
            # stddev 0.05, which the original author says matches the
            # official TensorFlow tutorial.
            initializer=tf.truncated_normal_initializer(stddev=0.05,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[96],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        # Output: [BATCH_SIZE, 32, 32, 96] (stride 1, SAME padding).

    # pool1 and norm1: 3x3 max-pooling with stride 2, then LRN.
    with tf.variable_scope('pooling1_lrn') as scope:
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1],
                               strides=[1, 2, 2, 1],
                               padding='SAME', name='pooling1')
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1.0,
                          alpha=0.001 / 9.0, beta=0.75, name='norm1')
        # Output: [BATCH_SIZE, 16, 16, 96] (stride 2 halves height/width).

    # conv2: 3x3 kernels, 96 input channels, 64 output channels.
    with tf.variable_scope('conv2') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 96, 64],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.05,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[64],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name='conv2')
        # Output: [BATCH_SIZE, 16, 16, 64].

    # norm2 and pool2: LRN first, then 3x3 max-pooling with stride 1.
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0,
                          alpha=0.001 / 9.0, beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                               strides=[1, 1, 1, 1],
                               padding='SAME', name='pooling2')
        # Output: [BATCH_SIZE, 16, 16, 64] (stride 1 keeps the size).

    # local3: first fully connected layer.
    with tf.variable_scope('local3') as scope:
        # Flatten every example to one row; dim is the per-example feature
        # count read from the static shape (16 * 16 * 64 = 16384 under the
        # assumptions above).
        reshape = tf.reshape(pool2, shape=[BATCH_SIZE, -1])
        dim = reshape.get_shape()[1].value
        weights = tf.get_variable(
            'weights',
            shape=[dim, 384],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[384],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        # Output: [BATCH_SIZE, 384].

    # local4: second fully connected layer.
    with tf.variable_scope('local4') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[384, 192],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[192],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name='local4')
        # Output: [BATCH_SIZE, 192].

    # Final linear layer producing 10 class scores.
    with tf.variable_scope('softmax_linear') as scope:
        # The weight variable is named 'softmax_linear' (not 'weights');
        # the name is kept so existing checkpoints still restore.
        weights = tf.get_variable(
            'softmax_linear',
            shape=[192, 10],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[10],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name='softmax_linear')
        # Output: [BATCH_SIZE, 10].

    return softmax_linear

4.loss求解=损失函数

def losses(logits, labels):
    """Return the mean softmax cross-entropy of a batch.

    Args:
        logits: unnormalised class scores, as returned by inference().
        labels: labels for the batch. The original author notes that this
            loss function needs one-hot encoded labels.

    Returns:
        Scalar tensor named 'loss': the mean of the per-example
        cross-entropy. A scalar summary of it is registered as a side
        effect.
    """
    with tf.variable_scope('loss') as loss_scope:
        int_labels = tf.cast(labels, tf.int64)

        # Author's note: with one-hot labels this is the loss to use.
        # tf.nn.sparse_softmax_cross_entropy_with_logits is the recommended
        # alternative when labels are not one-hot, and saves some time.
        per_example = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits,
            labels=int_labels,
            name='xentropy_per_example')

        mean_loss = tf.reduce_mean(per_example, name='loss')
        tf.summary.scalar(loss_scope.name + '/loss', mean_loss)

    return mean_loss

5.训练，也就是网络的反向传播，选用的是SGD梯度下降，也就是随机梯度下降。

# 开始训练，这里需要填入自己的数据地址
def train():
    """Train the network with plain gradient descent and save checkpoints.

    Builds the input pipeline, model, loss and optimiser in the default
    graph, then runs MAX_STEP training steps. The loss is printed every 50
    steps, summaries are written every 100 steps, and a checkpoint is saved
    every 2000 steps and on the last step.

    Change data_dir and log_dir to your own paths before running.

    NOTE(review): the published listing had lost the head of the
    input_data call, the optimizer definition, the queue-runner start-up
    and the summary write. They are restored here. `read_cifar10` is
    assumed to be the reader in the companion input_data.py, and the
    pipeline is assumed to be queue-based -- confirm both.
    """
    my_global_step = tf.Variable(0, name='global_step', trainable=False)

    # Directory holding the CIFAR-10 data files.
    data_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
    # Directory receiving checkpoints and TensorBoard summaries.
    log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'

    # input_data is the batching module written earlier in this series.
    images, labels = input_data.read_cifar10(data_dir,
                                             is_train=True,
                                             batch_size=BATCH_SIZE,
                                             shuffle=True)

    # logits are the raw class scores; softmax is applied inside the loss.
    logits = inference(images)
    loss = losses(logits, labels)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss, global_step=my_global_step)

    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge_all()

    init = tf.global_variables_initializer()
    sess = tf.Session()
    summary_writer = None
    try:
        sess.run(init)

        coord = tf.train.Coordinator()
        # Without running queue runners a queue-based reader never yields
        # a batch and the first sess.run would block forever.
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        try:
            for step in range(MAX_STEP):
                if coord.should_stop():
                    break
                _, loss_value = sess.run([train_op, loss])

                if step % 50 == 0:
                    print('Step: %d, loss: %.4f' % (step, loss_value))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if step % 2000 == 0 or (step + 1) == MAX_STEP:
                    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
    finally:
        # Release the writer and the session even if training failed.
        if summary_writer is not None:
            summary_writer.close()
        sess.close()

6.评估：用测试集检验训练好的模型的正确率。

def evaluate():
    """Restore the latest checkpoint and print top-1 precision on test data.

    Builds a fresh graph, restores the newest checkpoint found in log_dir,
    runs ceil(n_test / BATCH_SIZE) batches and prints the fraction of
    predictions whose top-1 class matches the label. Prints a message and
    returns early when no checkpoint exists.

    Change log_dir and test_dir to your own paths before running.

    NOTE(review): the published listing had lost the head of the
    input_data call and the queue-runner start-up. They are restored here.
    `read_cifar10` is assumed to be the reader in the companion
    input_data.py, and the pipeline is assumed to be queue-based --
    confirm both.
    """
    with tf.Graph().as_default():

        # Directory holding the checkpoints written by train().
        log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'
        # Directory holding the test data.
        test_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
        n_test = 10000

        images, labels = input_data.read_cifar10(test_dir,
                                                 is_train=False,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=False)

        logits = inference(images)

        # tf.nn.in_top_k only accepts 1-D int32/int64 class ids as targets;
        # float labels raise the TypeError quoted in the error-analysis
        # section of this post. One-hot labels are turned back into ids.
        if labels.get_shape().ndims == 2:
            labels = tf.argmax(labels, axis=1)
        labels = tf.cast(labels, tf.int32)

        # Boolean vector: True where the top-1 prediction equals the label.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session() as sess:
            # Restore the trained model.
            ckpt = tf.train.get_checkpoint_state(log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Checkpoint files are named 'model.ckpt-<step>'.
                ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                global_step = ckpt_name.rsplit('-', 1)[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            else:
                print('No checkpoint file found')
                return

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                # float() keeps the division exact under Python 2 as well.
                num_iter = int(math.ceil(float(n_test) / BATCH_SIZE))
                true_count = 0
                # Counts whole batches, so this can exceed n_test.
                total_sample_count = num_iter * BATCH_SIZE
                step = 0

                while step < num_iter and not coord.should_stop():
                    predictions = sess.run([top_k_op])
                    true_count += np.sum(predictions)
                    step += 1
                # precision = correct predictions / evaluated images
                precision = float(true_count) / total_sample_count
                print('precision = %.3f' % precision)
            except Exception as e:
                # Hand the error to the coordinator; join() re-raises it.
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)

if __name__ == "__main__":
    # Author's note: the full 100k-step run took two days, but the loss had
    # already reached 0 at around 40k-50k steps, so that many steps are
    # not needed.
    train()
    evaluate()

7.错误分析

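TypeError: Value passed to parameter 'targets' has DataType float32 not in list of allowed values: int32, int64

出错的语句：

top_k_op = tf.nn.in_top_k(logits, labels, 1)

原因：tf.nn.in_top_k 的 targets 参数（即这里的 labels）必须是 int32 或 int64 类型的一维类别编号，而传入的 labels 是 float32。

解决办法：调用前先转换类型，例如 labels = tf.cast(labels, tf.int32)；如果 labels 是 one-hot 编码，则先用 tf.argmax(labels, 1) 还原成类别编号。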

'''

# How to run?
# 0. you need to change the data directory（目录）
# 1. run cifar10-main.py
# 2. call train() in the console to train the model
# 3. call evaluate() in the console to test on the test data

'''

# 导入模型
import os
import os.path
import math
import numpy as np
import tensorflow as tf
import input_data # 之前写的数据分批

# Hyperparameters: batch size, learning rate and maximum number of training steps.
BATCH_SIZE = 128
learning_rate = 0.05
MAX_STEP = 100000
# Original author's note: with this setting, it took less than 30 mins on my laptop to train.
# A larger MAX_STEP trains for longer; start with a smaller value such as 10000.
# 100k steps matches the official TensorFlow tutorial and may take a long time.
'''

'''
def inference(images):
    """Build the CIFAR-10 classification network and return its logits.

    The network is: conv1 -> max-pool + LRN -> conv2 -> LRN + max-pool ->
    two fully connected ReLU layers -> a final linear layer with 10 outputs.

    Args:
        images: 4D float tensor laid out as
            [batch_size, height, width, channels] (the default NHWC layout
            of tf.nn.conv2d). The first dimension must equal the
            module-level BATCH_SIZE, because the flatten step reshapes to
            [BATCH_SIZE, -1].

    Returns:
        2D tensor of shape [BATCH_SIZE, 10] holding unnormalised class
        scores (logits). No softmax is applied here.

    Notes:
        A conv kernel has shape
        [kernel_height, kernel_width, input_channels, output_channels].
        The input channel count comes from the previous layer; for the
        first layer it is the number of image channels.

        NOTE(review): the bias-add lines and the tails of both max_pool
        calls were missing from the published listing. They are restored
        here with padding='SAME' -- confirm against the author's code.
        Shape comments below assume 32x32x3 inputs, as the original
        listing states.
    """
    # conv1: 3x3 kernels, 3 input channels, 96 output channels.
    with tf.variable_scope('conv1') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 3, 96],
            dtype=tf.float32,
            # stddev 0.05, which the original author says matches the
            # official TensorFlow tutorial.
            initializer=tf.truncated_normal_initializer(stddev=0.05,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[96],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        # Output: [BATCH_SIZE, 32, 32, 96] (stride 1, SAME padding).

    # pool1 and norm1: 3x3 max-pooling with stride 2, then LRN.
    with tf.variable_scope('pooling1_lrn') as scope:
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1],
                               strides=[1, 2, 2, 1],
                               padding='SAME', name='pooling1')
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1.0,
                          alpha=0.001 / 9.0, beta=0.75, name='norm1')
        # Output: [BATCH_SIZE, 16, 16, 96] (stride 2 halves height/width).

    # conv2: 3x3 kernels, 96 input channels, 64 output channels.
    with tf.variable_scope('conv2') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 96, 64],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.05,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[64],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name='conv2')
        # Output: [BATCH_SIZE, 16, 16, 64].

    # norm2 and pool2: LRN first, then 3x3 max-pooling with stride 1.
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0,
                          alpha=0.001 / 9.0, beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                               strides=[1, 1, 1, 1],
                               padding='SAME', name='pooling2')
        # Output: [BATCH_SIZE, 16, 16, 64] (stride 1 keeps the size).

    # local3: first fully connected layer.
    with tf.variable_scope('local3') as scope:
        # Flatten every example to one row; dim is the per-example feature
        # count read from the static shape (16 * 16 * 64 = 16384 under the
        # assumptions above).
        reshape = tf.reshape(pool2, shape=[BATCH_SIZE, -1])
        dim = reshape.get_shape()[1].value
        weights = tf.get_variable(
            'weights',
            shape=[dim, 384],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[384],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        # Output: [BATCH_SIZE, 384].

    # local4: second fully connected layer.
    with tf.variable_scope('local4') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[384, 192],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[192],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name='local4')
        # Output: [BATCH_SIZE, 192].

    # Final linear layer producing 10 class scores.
    with tf.variable_scope('softmax_linear') as scope:
        # The weight variable is named 'softmax_linear' (not 'weights');
        # the name is kept so existing checkpoints still restore.
        weights = tf.get_variable(
            'softmax_linear',
            shape=[192, 10],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004,
                                                        dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[10],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name='softmax_linear')
        # Output: [BATCH_SIZE, 10].

    return softmax_linear

# 计算cost
def losses(logits, labels):
    """Return the mean softmax cross-entropy of a batch.

    Args:
        logits: unnormalised class scores, as returned by inference().
        labels: labels for the batch. The original author notes that this
            loss function needs one-hot encoded labels.

    Returns:
        Scalar tensor named 'loss': the mean of the per-example
        cross-entropy. A scalar summary of it is registered as a side
        effect.
    """
    with tf.variable_scope('loss') as loss_scope:
        int_labels = tf.cast(labels, tf.int64)

        # Author's note: with one-hot labels this is the loss to use.
        # tf.nn.sparse_softmax_cross_entropy_with_logits is the recommended
        # alternative when labels are not one-hot, and saves some time.
        per_example = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits,
            labels=int_labels,
            name='xentropy_per_example')

        mean_loss = tf.reduce_mean(per_example, name='loss')
        tf.summary.scalar(loss_scope.name + '/loss', mean_loss)

    return mean_loss

# 开始训练，这里需要填入自己的数据地址
def train():
    """Train the network with plain gradient descent and save checkpoints.

    Builds the input pipeline, model, loss and optimiser in the default
    graph, then runs MAX_STEP training steps. The loss is printed every 50
    steps, summaries are written every 100 steps, and a checkpoint is saved
    every 2000 steps and on the last step.

    Change data_dir and log_dir to your own paths before running.

    NOTE(review): the published listing had lost the head of the
    input_data call, the optimizer definition, the queue-runner start-up
    and the summary write. They are restored here. `read_cifar10` is
    assumed to be the reader in the companion input_data.py, and the
    pipeline is assumed to be queue-based -- confirm both.
    """
    my_global_step = tf.Variable(0, name='global_step', trainable=False)

    # Directory holding the CIFAR-10 data files.
    data_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
    # Directory receiving checkpoints and TensorBoard summaries.
    log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'

    # input_data is the batching module written earlier in this series.
    images, labels = input_data.read_cifar10(data_dir,
                                             is_train=True,
                                             batch_size=BATCH_SIZE,
                                             shuffle=True)

    # logits are the raw class scores; softmax is applied inside the loss.
    logits = inference(images)
    loss = losses(logits, labels)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss, global_step=my_global_step)

    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge_all()

    init = tf.global_variables_initializer()
    sess = tf.Session()
    summary_writer = None
    try:
        sess.run(init)

        coord = tf.train.Coordinator()
        # Without running queue runners a queue-based reader never yields
        # a batch and the first sess.run would block forever.
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        try:
            for step in range(MAX_STEP):
                if coord.should_stop():
                    break
                _, loss_value = sess.run([train_op, loss])

                if step % 50 == 0:
                    print('Step: %d, loss: %.4f' % (step, loss_value))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if step % 2000 == 0 or (step + 1) == MAX_STEP:
                    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
    finally:
        # Release the writer and the session even if training failed.
        if summary_writer is not None:
            summary_writer.close()
        sess.close()

# 评估，注意改地址

def evaluate():
    """Restore the latest checkpoint and print top-1 precision on test data.

    Builds a fresh graph, restores the newest checkpoint found in log_dir,
    runs ceil(n_test / BATCH_SIZE) batches and prints the fraction of
    predictions whose top-1 class matches the label. Prints a message and
    returns early when no checkpoint exists.

    Change log_dir and test_dir to your own paths before running.

    NOTE(review): the published listing had lost the head of the
    input_data call, the queue-runner start-up and the final thread join.
    They are restored here. `read_cifar10` is assumed to be the reader in
    the companion input_data.py, and the pipeline is assumed to be
    queue-based -- confirm both.
    """
    with tf.Graph().as_default():

        # Directory holding the checkpoints written by train().
        log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'
        # Directory holding the test data.
        test_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
        n_test = 10000

        images, labels = input_data.read_cifar10(test_dir,
                                                 is_train=False,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=False)

        logits = inference(images)

        # tf.nn.in_top_k only accepts 1-D int32/int64 class ids as targets;
        # float labels raise a TypeError ("Value passed to parameter
        # 'targets' has DataType float32 ..."). One-hot labels are turned
        # back into ids.
        if labels.get_shape().ndims == 2:
            labels = tf.argmax(labels, axis=1)
        labels = tf.cast(labels, tf.int32)

        # Boolean vector: True where the top-1 prediction equals the label.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session() as sess:
            # Restore the trained model.
            ckpt = tf.train.get_checkpoint_state(log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Checkpoint files are named 'model.ckpt-<step>'.
                ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                global_step = ckpt_name.rsplit('-', 1)[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            else:
                print('No checkpoint file found')
                return

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                # float() keeps the division exact under Python 2 as well.
                num_iter = int(math.ceil(float(n_test) / BATCH_SIZE))
                true_count = 0
                # Counts whole batches, so this can exceed n_test.
                total_sample_count = num_iter * BATCH_SIZE
                step = 0

                while step < num_iter and not coord.should_stop():
                    predictions = sess.run([top_k_op])
                    true_count += np.sum(predictions)
                    step += 1
                # precision = correct predictions / evaluated images
                precision = float(true_count) / total_sample_count
                print('precision = %.3f' % precision)
            except Exception as e:
                # Hand the error to the coordinator; join() re-raises it.
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)

if __name__ == "__main__":
    # Training is left commented out so that running the script only
    # evaluates an existing checkpoint.
    # train()
    # Author's note: the full 100k-step run took two days, but the loss had
    # already reached 0 at around 40k-50k steps, so that many steps are
    # not needed.
    evaluate()