本教程参考Kevin的视频教程:Youtube
数据集下载地址为:点击打开链接
Kevin知乎:点击打开链接
在上一部分 https://blog.csdn.net/u014264373/article/details/79960869 已经讲了数据的处理,接下来就是训练以及评估。
1. 这部分代码的使用方法,如果你不想知道怎么写的,可以直接运行。文尾附上全部代码
# 0. you need to change the data directory(目录),文中所有涉及地址的地方你都要改成自己的
# 1. run cifar10-MAIN.py
# 2. call train() in the console to train the model
# 3. call evaluate() in the console to test on the test data
-----------------------------------------------代码分解---------------------------------------------------
1.导入包
import os
import os.path
import math
import numpy as np
import tensorflow as tf
import input_data # 之前写的数据处理
2.设置一些超参数
# Hyperparameters: mini-batch size, SGD learning rate and maximum number of training steps.
BATCH_SIZE = 128
learning_rate = 0.05
MAX_STEP = 100000
# Training time grows with MAX_STEP; start with a smaller value such as 10000 for a quick run.
# NOTE(review): the original note claimed "less than 30 mins on my laptop" for this setting;
# that looks inherited from a different configuration -- 100k steps may take far longer, confirm on your hardware.
# According to the author, 100k steps mirrors the TensorFlow tutorial setting.
接下来网络中的一些参数初始化,如W,b、包括各层的卷积,激活,池化等,包含整个模型框架。这个框架同CATS VS DOGS框架一样。整个卷积过程就是卷积》池化》归一化》卷积》归一化》池化》全连接》全连接》softmax
对于应该先做归一化(LRN)还是先做池化,我还没有完全弄明白,你知道吗?
3.网络架构
def inference(images):
    """Build the CNN and return the unnormalized class scores (logits).

    Layer order: conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 (dense) -> local4 (dense) -> softmax_linear (dense, 10 outputs).

    Args:
        images: 4D float tensor [batch_size, height, width, channels]. The
            first kernel expects 3 input channels, and height/width must be
            statically known so the first dense layer can be sized. The size
            comments below assume 32x32x3 inputs, as stated in the original
            notes.

    Returns:
        2D tensor [batch_size, 10] holding the logits; no softmax is applied
        here.

    Notes:
        Each conv kernel has shape
        [kernel_size, kernel_size, input channels, output channels], where
        the input channel count is the previous layer's output channel count
        (or the image channels for the first layer).
    """
    # conv1: 3x3 kernels, 3 input channels -> 96 output channels.
    with tf.variable_scope('conv1') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 3, 96],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[96],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)  # scope.name == 'conv1'
        # Output: [batch, 32, 32, 96] (stride 1 with SAME padding keeps H and W).

    # pool1 and norm1: 3x3 max pooling with stride 2, then local response normalization.
    with tf.variable_scope('pooling1_lrn') as scope:
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pooling1')
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm1')
        # Output: [batch, 16, 16, 96] (stride 2 halves H and W).

    # conv2: 3x3 kernels, 96 input channels -> 64 output channels.
    with tf.variable_scope('conv2') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 96, 64],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[64],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name='conv2')
        # Output: [batch, 16, 16, 64].

    # norm2 and pool2: normalization first, then 3x3 max pooling with stride 1.
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1],
                               padding='SAME', name='pooling2')
        # Output: [batch, 16, 16, 64] (stride 1 keeps H and W).

    # local3: first fully connected layer.
    with tf.variable_scope('local3') as scope:
        # Flatten using the static feature shape instead of the global
        # BATCH_SIZE, so the graph is not tied to one particular batch size.
        _, height, width, channels = pool2.get_shape().as_list()
        dim = height * width * channels  # 16 * 16 * 64 = 16384 for 32x32 inputs
        reshape = tf.reshape(pool2, shape=[-1, dim])
        weights = tf.get_variable(
            'weights',
            shape=[dim, 384],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[384],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
        # Output: [batch, 384].

    # local4: second fully connected layer.
    with tf.variable_scope('local4') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[384, 192],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[192],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name='local4')
        # Output: [batch, 192].

    # softmax_linear: final linear layer producing 10 class scores.
    with tf.variable_scope('softmax_linear') as scope:
        # The weight variable keeps its historical name 'softmax_linear'
        # (rather than 'weights') so previously saved checkpoints still restore.
        weights = tf.get_variable(
            'softmax_linear',
            shape=[192, 10],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[10],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name='softmax_linear')
        # Output: [batch, 10].

    return softmax_linear
4.loss求解=损失函数
def losses(logits, labels):
    """Return the mean softmax cross-entropy of a batch.

    Args:
        logits: unnormalized class scores, [batch_size, num_classes].
        labels: either one-hot labels [batch_size, num_classes] or plain
            class indices [batch_size]. The matching loss op is selected
            from the static rank of this tensor.

    Returns:
        Scalar loss tensor. A scalar summary of the loss is registered as a
        side effect.
    """
    with tf.variable_scope('loss') as scope:
        labels = tf.cast(labels, tf.int64)
        if labels.get_shape().ndims == 1:
            # Class-index labels: the sparse variant needs no one-hot encoding.
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=labels, name='xentropy_per_example')
        else:
            # One-hot labels (the original behaviour of this function).
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=labels, name='xentropy_per_example')
        loss = tf.reduce_mean(cross_entropy, name='loss')
        tf.summary.scalar(scope.name + '/loss', loss)
    return loss
这里需要注意的是loss函数的选择,要依据有没有用one-hot选择不同的函数,在代码里有注释,请合理选用。
5.训练,也就是网络的反向传播,选用的是SGD梯度下降,也就是随机梯度下降。
# 开始训练,这里需要填入自己的数据地址
def train(data_dir='D:/Python/neural network/CIFAR10-Guoqingxu/data/',
          log_dir='D:/Python/neural network/CIFAR10-Guoqingxu/logs/'):
    """Train the network with plain SGD and write summaries and checkpoints.

    Args:
        data_dir: directory holding the training data; passed to
            input_data.read_cifar10. Defaults to the author's local path --
            change it to your own.
        log_dir: directory that receives TensorBoard summaries and
            'model.ckpt-<step>' checkpoints.

    The loop runs for at most MAX_STEP steps, prints the loss every 50
    steps, writes a summary every 100 steps and saves a checkpoint every
    2000 steps and on the final step.
    """
    # Build everything in a fresh graph so train() can be called repeatedly
    # from a console without colliding with variables created earlier.
    with tf.Graph().as_default():
        my_global_step = tf.Variable(0, name='global_step', trainable=False)

        # input_data is the data-loading module from the previous part;
        # presumably it returns queue-backed batch tensors -- confirm there.
        images, labels = input_data.read_cifar10(data_dir=data_dir,
                                                 is_train=True,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=True)

        # logits are the raw class scores; softmax is applied inside the loss.
        logits = inference(images)
        loss = losses(logits, labels)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=my_global_step)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        sess = tf.Session()
        sess.run(init)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        checkpoint_path = os.path.join(log_dir, 'model.ckpt')

        try:
            for step in range(MAX_STEP):
                if coord.should_stop():
                    break

                if step % 100 == 0:
                    # Fetch the summary in the same run as the training step:
                    # a separate sess.run would pull another input batch and
                    # log a loss for data the step was not trained on.
                    _, loss_value, summary_str = sess.run(
                        [train_op, loss, summary_op])
                    summary_writer.add_summary(summary_str, step)
                else:
                    _, loss_value = sess.run([train_op, loss])

                if step % 50 == 0:
                    print('Step: %d, loss: %.4f' % (step, loss_value))

                if step % 2000 == 0 or (step + 1) == MAX_STEP:
                    saver.save(sess, checkpoint_path, global_step=step)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            try:
                coord.join(threads)
            finally:
                # Always release the event file and the session, even if a
                # queue-runner thread reported an error.
                summary_writer.close()
                sess.close()
6.评估,用测试集测试训练好的模型的正确率。
def evaluate(log_dir='D:/Python/neural network/CIFAR10-Guoqingxu/logs/',
             test_dir='D:/Python/neural network/CIFAR10-Guoqingxu/data/',
             n_test=10000):
    """Restore the latest checkpoint and report top-1 precision on test data.

    Args:
        log_dir: directory containing the checkpoints written by training.
        test_dir: directory holding the test data; passed to
            input_data.read_cifar10. Defaults to the author's local path.
        n_test: number of test examples to cover.

    Returns:
        The measured precision as a float, or None when no checkpoint is
        found or evaluation was interrupted.
    """
    with tf.Graph().as_default():
        # Read the test data in its stored order.
        images, labels = input_data.read_cifar10(data_dir=test_dir,
                                                 is_train=False,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=False)

        logits = inference(images)
        # in_top_k needs integer class indices. One-hot labels would raise
        # "Value passed to parameter 'targets' has DataType float32 ...",
        # so convert them to indices first.
        if labels.get_shape().ndims == 2:
            labels = tf.argmax(labels, axis=1)
        # Boolean vector: True where the top prediction equals the label.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session() as sess:
            print("Reading checkpoints...")
            ckpt = tf.train.get_checkpoint_state(log_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('No checkpoint file found')
                return None

            # Checkpoint files are named 'model.ckpt-<step>'; basename copes
            # with either path separator.
            global_step = os.path.basename(
                ckpt.model_checkpoint_path).split('-')[-1]
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('Loading success, global_step is %s' % global_step)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            precision = None

            try:
                # float() keeps the ceiling correct under Python 2 as well.
                num_iter = int(math.ceil(float(n_test) / BATCH_SIZE))
                true_count = 0
                # The last batch is a full batch too, so slightly more than
                # n_test examples may be scored (presumably the input queue
                # wraps around -- confirm in input_data).
                total_sample_count = num_iter * BATCH_SIZE
                step = 0
                while step < num_iter and not coord.should_stop():
                    predictions = sess.run(top_k_op)
                    true_count += np.sum(predictions)
                    step += 1
                # precision = correctly predicted images / images evaluated
                precision = float(true_count) / total_sample_count
                print('precision = %.3f' % precision)
            except Exception as e:
                # Hand the error to the coordinator; join() below re-raises it.
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)

            return precision
完了,最后可以调用train()和evaluate()来训练网络并评估其性能。
if __name__ == '__main__':
    # Train first, then score the saved checkpoint on the test set.
    # Author's note: 100k steps took about two days, yet the loss had already
    # dropped to roughly 0 between 40k and 50k steps, so fewer steps suffice.
    train()
    evaluate()
这里建议:如果对精度要求不高,迭代10k次就够了;如果计算机性能允许,可以尝试100k次。
附上我的测试结果,训练就不演示了。我的正确率为78.1%
7.错误分析
TypeError: Value passed to parameter 'targets' has DataType float32 not in list of allowed values: int32, int64
如果出现这个错误,是因为代码
top_k_op = tf.nn.in_top_k(logits, labels, 1)
这句造成的,这个函数tf.nn.in_top_k,不接受labels的one-hot格式,所以解决的方法很简单,把数据处理的代码改为不用one-hot处理的就好了。
感兴趣的还可以用tensorboard把我们的模型及结果可视化看看。
附全代码:
'''
这个部分主要是模型的搭建,训练以及评估
# How to run?
# 0. you need to change the data directory(目录)
# 1. run cifar10-main.py
# 2. call train() in the console to train the model
# 3. call evaluate() in the console to test on the test data
'''
# 导入模型
import os
import os.path
import math
import numpy as np
import tensorflow as tf
import input_data # 之前写的数据分批
# Hyperparameters: mini-batch size, SGD learning rate and maximum number of training steps.
BATCH_SIZE = 128
learning_rate = 0.05
MAX_STEP = 100000
# Training time grows with MAX_STEP; start with a smaller value such as 10000 for a quick run.
# NOTE(review): the original note claimed "less than 30 mins on my laptop" for this setting;
# that looks inherited from a different configuration -- 100k steps may take far longer, confirm on your hardware.
# According to the author, 100k steps mirrors the TensorFlow tutorial setting.
'''
接下来网络中的一些参数初始化,如W,b、包括各层的卷积,激活,池化等,包含整个模型框架
这个框架同CATS VS DOGS框架一样
整个卷积过程就是卷积》池化》归一化》卷积》归一化》池化》全连接》全连接》softmax
对于先是正则化还是池化还没有领悟
'''
def inference(images):
    """Build the CNN and return the unnormalized class scores (logits).

    Layer order: conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 (dense) -> local4 (dense) -> softmax_linear (dense, 10 outputs).

    Args:
        images: 4D float tensor [batch_size, height, width, channels]. The
            first kernel expects 3 input channels, and height/width must be
            statically known so the first dense layer can be sized. The size
            comments below assume 32x32x3 inputs, as stated in the original
            notes.

    Returns:
        2D tensor [batch_size, 10] holding the logits; no softmax is applied
        here.

    Notes:
        Each conv kernel has shape
        [kernel_size, kernel_size, input channels, output channels], where
        the input channel count is the previous layer's output channel count
        (or the image channels for the first layer).
    """
    # conv1: 3x3 kernels, 3 input channels -> 96 output channels.
    with tf.variable_scope('conv1') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 3, 96],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[96],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)  # scope.name == 'conv1'
        # Output: [batch, 32, 32, 96] (stride 1 with SAME padding keeps H and W).

    # pool1 and norm1: 3x3 max pooling with stride 2, then local response normalization.
    with tf.variable_scope('pooling1_lrn') as scope:
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pooling1')
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm1')
        # Output: [batch, 16, 16, 96] (stride 2 halves H and W).

    # conv2: 3x3 kernels, 96 input channels -> 64 output channels.
    with tf.variable_scope('conv2') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[3, 3, 96, 64],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[64],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name='conv2')
        # Output: [batch, 16, 16, 64].

    # norm2 and pool2: normalization first, then 3x3 max pooling with stride 1.
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1],
                               padding='SAME', name='pooling2')
        # Output: [batch, 16, 16, 64] (stride 1 keeps H and W).

    # local3: first fully connected layer.
    with tf.variable_scope('local3') as scope:
        # Flatten using the static feature shape instead of the global
        # BATCH_SIZE, so the graph is not tied to one particular batch size.
        _, height, width, channels = pool2.get_shape().as_list()
        dim = height * width * channels  # 16 * 16 * 64 = 16384 for 32x32 inputs
        reshape = tf.reshape(pool2, shape=[-1, dim])
        weights = tf.get_variable(
            'weights',
            shape=[dim, 384],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[384],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
        # Output: [batch, 384].

    # local4: second fully connected layer.
    with tf.variable_scope('local4') as scope:
        weights = tf.get_variable(
            'weights',
            shape=[384, 192],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[192],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name='local4')
        # Output: [batch, 192].

    # softmax_linear: final linear layer producing 10 class scores.
    with tf.variable_scope('softmax_linear') as scope:
        # The weight variable keeps its historical name 'softmax_linear'
        # (rather than 'weights') so previously saved checkpoints still restore.
        weights = tf.get_variable(
            'softmax_linear',
            shape=[192, 10],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable(
            'biases',
            shape=[10],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name='softmax_linear')
        # Output: [batch, 10].

    return softmax_linear
# 计算cost
def losses(logits, labels):
    """Return the mean softmax cross-entropy of a batch.

    Args:
        logits: unnormalized class scores, [batch_size, num_classes].
        labels: either one-hot labels [batch_size, num_classes] or plain
            class indices [batch_size]. The matching loss op is selected
            from the static rank of this tensor.

    Returns:
        Scalar loss tensor. A scalar summary of the loss is registered as a
        side effect.
    """
    with tf.variable_scope('loss') as scope:
        labels = tf.cast(labels, tf.int64)
        if labels.get_shape().ndims == 1:
            # Class-index labels: the sparse variant needs no one-hot encoding.
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=labels, name='xentropy_per_example')
        else:
            # One-hot labels (the original behaviour of this function).
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=labels, name='xentropy_per_example')
        loss = tf.reduce_mean(cross_entropy, name='loss')
        tf.summary.scalar(scope.name + '/loss', loss)
    return loss
# 开始训练,这里需要填入自己的数据地址
def train(data_dir='D:/Python/neural network/CIFAR10-Guoqingxu/data/',
          log_dir='D:/Python/neural network/CIFAR10-Guoqingxu/logs/'):
    """Train the network with plain SGD and write summaries and checkpoints.

    Args:
        data_dir: directory holding the training data; passed to
            input_data.read_cifar10. Defaults to the author's local path --
            change it to your own.
        log_dir: directory that receives TensorBoard summaries and
            'model.ckpt-<step>' checkpoints.

    The loop runs for at most MAX_STEP steps, prints the loss every 50
    steps, writes a summary every 100 steps and saves a checkpoint every
    2000 steps and on the final step.
    """
    # Build everything in a fresh graph so train() can be called repeatedly
    # from a console without colliding with variables created earlier.
    with tf.Graph().as_default():
        my_global_step = tf.Variable(0, name='global_step', trainable=False)

        # input_data is the data-loading module from the previous part;
        # presumably it returns queue-backed batch tensors -- confirm there.
        images, labels = input_data.read_cifar10(data_dir=data_dir,
                                                 is_train=True,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=True)

        # logits are the raw class scores; softmax is applied inside the loss.
        logits = inference(images)
        loss = losses(logits, labels)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=my_global_step)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        sess = tf.Session()
        sess.run(init)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        checkpoint_path = os.path.join(log_dir, 'model.ckpt')

        try:
            for step in range(MAX_STEP):
                if coord.should_stop():
                    break

                if step % 100 == 0:
                    # Fetch the summary in the same run as the training step:
                    # a separate sess.run would pull another input batch and
                    # log a loss for data the step was not trained on.
                    _, loss_value, summary_str = sess.run(
                        [train_op, loss, summary_op])
                    summary_writer.add_summary(summary_str, step)
                else:
                    _, loss_value = sess.run([train_op, loss])

                if step % 50 == 0:
                    print('Step: %d, loss: %.4f' % (step, loss_value))

                if step % 2000 == 0 or (step + 1) == MAX_STEP:
                    saver.save(sess, checkpoint_path, global_step=step)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            try:
                coord.join(threads)
            finally:
                # Always release the event file and the session, even if a
                # queue-runner thread reported an error.
                summary_writer.close()
                sess.close()
# 评估,注意改地址
def evaluate(log_dir='D:/Python/neural network/CIFAR10-Guoqingxu/logs/',
             test_dir='D:/Python/neural network/CIFAR10-Guoqingxu/data/',
             n_test=10000):
    """Restore the latest checkpoint and report top-1 precision on test data.

    Args:
        log_dir: directory containing the checkpoints written by training.
        test_dir: directory holding the test data; passed to
            input_data.read_cifar10. Defaults to the author's local path.
        n_test: number of test examples to cover.

    Returns:
        The measured precision as a float, or None when no checkpoint is
        found or evaluation was interrupted.
    """
    with tf.Graph().as_default():
        # Read the test data in its stored order.
        images, labels = input_data.read_cifar10(data_dir=test_dir,
                                                 is_train=False,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=False)

        logits = inference(images)
        # in_top_k needs integer class indices. One-hot labels would raise
        # "Value passed to parameter 'targets' has DataType float32 ...",
        # so convert them to indices first.
        if labels.get_shape().ndims == 2:
            labels = tf.argmax(labels, axis=1)
        # Boolean vector: True where the top prediction equals the label.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session() as sess:
            print("Reading checkpoints...")
            ckpt = tf.train.get_checkpoint_state(log_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('No checkpoint file found')
                return None

            # Checkpoint files are named 'model.ckpt-<step>'; basename copes
            # with either path separator.
            global_step = os.path.basename(
                ckpt.model_checkpoint_path).split('-')[-1]
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('Loading success, global_step is %s' % global_step)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            precision = None

            try:
                # float() keeps the ceiling correct under Python 2 as well.
                num_iter = int(math.ceil(float(n_test) / BATCH_SIZE))
                true_count = 0
                # The last batch is a full batch too, so slightly more than
                # n_test examples may be scored (presumably the input queue
                # wraps around -- confirm in input_data).
                total_sample_count = num_iter * BATCH_SIZE
                step = 0
                while step < num_iter and not coord.should_stop():
                    predictions = sess.run(top_k_op)
                    true_count += np.sum(predictions)
                    step += 1
                # precision = correctly predicted images / images evaluated
                precision = float(true_count) / total_sample_count
                print('precision = %.3f' % precision)
            except Exception as e:
                # Hand the error to the coordinator; join() below re-raises it.
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)

            return precision
if __name__ == '__main__':
    # Only evaluation runs here; training is left disabled.
    # train()  # enable to (re)train before evaluating
    # Author's note: 100k steps took about two days, yet the loss had already
    # dropped to roughly 0 between 40k and 50k steps, so fewer steps suffice.
    evaluate()