【深度学习笔记2.2.2】AlexNet训练mnist

最新推荐文章于 2023-05-22 20:35:04 发布

取取经

最新推荐文章于 2023-05-22 20:35:04 发布

阅读量767

点赞数

分类专栏：深度学习笔记

本文链接：https://blog.csdn.net/yahstudio/article/details/87539009

版权

深度学习笔记专栏收录该内容

12 篇文章 0 订阅

订阅专栏

实验1：AlexNet Tensorflow 实现

代码示例如下(详见文献[2]AlexNet1.py)：

import numpy as np
import cv2
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt

# Directory holding the MNIST files (example path -- adjust to the local machine).
datapath = '/home/pathto/res/MNIST_data'
# validation_size=0: no images are split off into a validation set; labels are one-hot vectors.
mnist_data_set = input_data.read_data_sets(datapath, validation_size=0, one_hot=True)


def image_shape_scale(batch_xs):
    """Upscale a batch of flattened 28x28 images to the 227x227 network input size.

    Args:
        batch_xs: array whose first dimension is the batch size and which holds
            28*28 values per image.

    Returns:
        Array of shape [batch, 227, 227, input_image_channel].
    """
    batch = batch_xs.shape[0]
    images = np.reshape(batch_xs, [batch, 28, 28])
    # Build the resized list directly rather than appending from inside a
    # comprehension that is evaluated only for its side effect.
    resized = np.array([cv2.resize(img, (227, 227)) for img in images])
    # input_image_channel is a module-level constant defined further down the script.
    return np.reshape(resized, [batch, 227, 227, input_image_channel])


# Hyper-parameters.
input_image_channel = 1  # channel count of the input placeholder and of the conv1 kernels
learning_rate = 1e-4
training_epoch = 50  # number of passes over the training set
batch_size = 200
n_classes = 10
n_fc1 = 6*6*256  # flattened size of pool5 ([-1, 6, 6, 256]), i.e. the input width of fc1
n_fc2 = 4096
n_fc3 = 4096
# NOTE(review): this value is passed positionally as the 2nd argument of tf.nn.dropout,
# which in TF 1.x is keep_prob (fraction kept), not a drop rate. 0.5 means the same either way.
dropout_rate = 0.5

# Network input (images already upscaled to 227x227) and one-hot labels.
X = tf.placeholder(tf.float32, [None, 227, 227, input_image_channel])
y = tf.placeholder(tf.float32, [None, n_classes])


# Weights for every layer, keyed by layer name: conv kernels are
# [height, width, in_channels, out_channels], fully connected matrices are [in, out].
# tf.truncated_normal is called without mean/stddev, so its defaults (mean 0.0, stddev 1.0) apply.
W_conv = {
    'conv1': tf.Variable(tf.truncated_normal([11, 11, input_image_channel, 96])),
    'conv2': tf.Variable(tf.truncated_normal([5, 5, 96, 256])),
    'conv3': tf.Variable(tf.truncated_normal([3, 3, 256, 384])),
    'conv4': tf.Variable(tf.truncated_normal([3, 3, 384, 384])),
    'conv5': tf.Variable(tf.truncated_normal([3, 3, 384, 256])),
    'fc1': tf.Variable(tf.truncated_normal([n_fc1, n_fc2])),
    'fc2': tf.Variable(tf.truncated_normal([n_fc2, n_fc3])),
    'output': tf.Variable(tf.truncated_normal([n_fc3, n_classes]))
}

# Biases for every layer (the fc and output layers too, despite the name),
# all initialised to the constant 0.1.
b_conv = {
    'conv1': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[96])),
    'conv2': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[256])),
    'conv3': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[384])),
    'conv4': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[384])),
    'conv5': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[256])),
    'fc1': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[n_fc2])),
    'fc2': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[n_fc3])),
    'output': tf.Variable(tf.constant(0.1, dtype=tf.float32, shape=[n_classes]))
}

# X is declared with this exact shape already, so the reshape only re-asserts it.
X_image = tf.reshape(X, [-1, 227, 227, input_image_channel])

# Convolution layer 1
conv1 = tf.nn.conv2d(X_image, W_conv['conv1'], strides=[1, 4, 4, 1], padding='VALID')
conv1 = tf.nn.bias_add(conv1, b_conv['conv1'])
conv1 = tf.nn.relu(conv1)
conv1 = tf.nn.local_response_normalization(conv1, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0)
# conv1.shape is now [-1, 55, 55, 96]

# Pooling layer 1
pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID')
# pool1.shape = [-1, 27, 27, 96]

# Convolution layer 2
conv2 = tf.nn.conv2d(pool1, W_conv['conv2'], strides=[1, 1, 1, 1], padding='SAME')
conv2 = tf.nn.bias_add(conv2, b_conv['conv2'])
conv2 = tf.nn.relu(conv2)
conv2 = tf.nn.local_response_normalization(conv2, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0)
# conv2.shape is now [-1, 27, 27, 256]

# Pooling layer 2
pool2 = tf.nn.max_pool(conv2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID')
# pool2.shape is now [-1, 13, 13, 256]

# Convolution layer 3
conv3 = tf.nn.conv2d(pool2, W_conv['conv3'], strides=[1, 1, 1, 1], padding='SAME')
conv3 = tf.nn.bias_add(conv3, b_conv['conv3'])
conv3 = tf.nn.relu(conv3)
# conv3.shape is now [-1, 13, 13, 384]

# Convolution layer 4
conv4 = tf.nn.conv2d(conv3, W_conv['conv4'], strides=[1, 1, 1, 1], padding='SAME')
conv4 = tf.nn.bias_add(conv4, b_conv['conv4'])
conv4 = tf.nn.relu(conv4)
# conv4.shape is now [-1, 13, 13, 384]

# Convolution layer 5
conv5 = tf.nn.conv2d(conv4, W_conv['conv5'], strides=[1, 1, 1, 1], padding='SAME')
conv5 = tf.nn.bias_add(conv5, b_conv['conv5'])
conv5 = tf.nn.relu(conv5)
# conv5.shape is now [-1, 13, 13, 256]

# Pooling layer 5
pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID')
# pool5.shape is now [-1, 6, 6, 256]

# Fully connected layer 1
reshape = tf.reshape(pool5, [-1, n_fc1])
# reshape.shape is now [-1, 9216]
fc1 = tf.add(tf.matmul(reshape, W_conv['fc1']), b_conv['fc1'])
fc1 = tf.nn.relu(fc1)
# NOTE(review): dropout is built unconditionally, so it is also active whenever
# loss/accuracy are evaluated through this graph (there is no train/eval switch).
fc1 = tf.nn.dropout(fc1, dropout_rate)
# fc1.shape is now [-1, 4096]

# Fully connected layer 2
fc2 = tf.add(tf.matmul(fc1, W_conv['fc2']), b_conv['fc2'])
fc2 = tf.nn.relu(fc2)
fc2 = tf.nn.dropout(fc2, dropout_rate)
# fc2.shape is now [-1, 4096]

# Output layer (raw logits, no activation)
output = tf.add(tf.matmul(fc2, W_conv['output']), b_conv['output'])
# output.shape is now [-1, 10]

# Cross-entropy loss (two alternatives):
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Option 1: hand-written cross-entropy
y_output = tf.nn.softmax(output)  # softmax over the last layer's output, i.e. per-class probabilities
# Cross-entropy between the softmax output and the one-hot labels. reduce_sum adds up
# every sample and class of the batch, so the result is already a scalar.
# NOTE: tf.log(y_output) is -inf wherever a probability underflows to 0, and 0 * -inf is NaN.
cross_entropy = -tf.reduce_sum(y * tf.log(y_output))
loss = tf.reduce_mean(cross_entropy)  # mean of a scalar: loss equals cross_entropy
# loss = -tf.reduce_mean(y * tf.log(y_output))  # NOT equivalent to the lines above: reduce_mean averages over all batch*class elements instead of summing them

# Option 2: TensorFlow's built-in tf.nn.softmax_cross_entropy_with_logits (takes the raw logits; averaged over the batch here)
# loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Plain gradient descent on the loss defined above.
train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

# Model evaluation: fraction of samples whose arg-max prediction equals the arg-max of the label.
correct_pred = tf.equal(tf.argmax(y_output, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()

loss_buf = []      # test loss recorded after every epoch
accuracy_buf = []  # test accuracy recorded after every epoch

# The evaluation subset (first 1000 test images) never changes, so upscale it
# once here instead of repeating the resize at the end of every epoch.
test_xs = image_shape_scale(mnist_data_set.test.images[0:1000, :])
test_ys = mnist_data_set.test.labels[0:1000, :]
test_feed = {X: test_xs, y: test_ys}

# NOTE: all graph ops are created above this point, so this device scope should
# not affect op placement; it is kept to leave the script structure unchanged.
with tf.device("/gpu:0"):
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)
    # log_device_placement=True logs the device chosen for every op; set it to
    # False for quieter output.
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(init)

        total_batch = mnist_data_set.train.num_examples // batch_size
        for i in range(training_epoch):
            for iteration in range(total_batch):
                batch_xs, batch_ys = mnist_data_set.train.next_batch(batch_size)
                train_feed = {X: image_shape_scale(batch_xs), y: batch_ys}

                sess.run(train_step, feed_dict=train_feed)
                # Accuracy on the batch that was just trained on, i.e. training accuracy.
                train_accuracy = sess.run(accuracy, feed_dict=train_feed)
                print("step {}, iteration {}, training accuracy {}".format(i, iteration, train_accuracy))

            # Fetch both metrics in one run: a single forward pass instead of two,
            # and loss and accuracy then describe the same pass.
            loss_val, test_accuracy = sess.run([loss, accuracy], feed_dict=test_feed)

            loss_buf.append(loss_val)
            accuracy_buf.append(test_accuracy)
            print("step {}, loss {}, testing accuracy {}".format(i, loss_val, test_accuracy))


# Plot the per-epoch test accuracy and test loss.
accuracy_ndarray = np.array(accuracy_buf)
accuracy_size = np.arange(len(accuracy_ndarray))
plt.plot(accuracy_size, accuracy_ndarray, 'b+', label='accuracy')

loss_ndarray = np.array(loss_buf)
loss_size = np.arange(len(loss_ndarray))
plt.plot(loss_size, loss_ndarray, 'r*', label='loss')

plt.legend()  # without a legend the label= arguments above are never displayed
plt.show()

# Save one "loss,accuracy" row per epoch to a csv file.
with open('AlexNet.csv', 'w') as fid:
    # The loop names are deliberately not `loss`/`accuracy`: at module level a
    # loop variable named `loss` would overwrite the loss tensor of the graph.
    for epoch_loss, epoch_acc in zip(loss_buf, accuracy_buf):
        fid.write(str(epoch_loss) + ',' + str(epoch_acc) + '\n')
# The with-block closes the file on exit, so no explicit fid.close() is needed.

print('end')

训练步骤打印结果如下：
step 0, loss nan, testing accuracy 0.08500000089406967
step 1, loss nan, testing accuracy 0.08500000089406967
step 2, loss nan, testing accuracy 0.08500000089406967
step 3, loss nan, testing accuracy 0.08500000089406967
… …
可以看到，模型无法收敛。

实验2：通过改变权重初始化方法进行优化

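后参考文献[1]代码，排查到模型无法收敛的原因可能是weights、biases的初始化不当。上述代码中，W_conv由tf.truncated_normal创建且没有指定mean和stddev，因此默认使用均值mean为0、标准差stddev为1的截断正态分布来初始化；b_conv则是用tf.constant全部初始化为常数0.1。对于fc1这类输入维度高达9216的层来说，标准差为1的权重很容易使输出数值过大，进而使loss变成nan。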
这里我们可以对上述代码AlexNet1.py做如下修改(其他不变，此部分完整代码详见文献[2]AlexNet2.py)：

# Same layer shapes as in AlexNet1.py, but the weights are drawn with an
# explicit stddev of 0.01 instead of the tf.truncated_normal default of 1.0.
W_conv = {
    'conv1': tf.Variable(tf.truncated_normal([11, 11, input_image_channel, 96], mean=0, stddev=0.01)),
    'conv2': tf.Variable(tf.truncated_normal([5, 5, 96, 256], mean=0, stddev=0.01)),
    'conv3': tf.Variable(tf.truncated_normal([3, 3, 256, 384], mean=0, stddev=0.01)),
    'conv4': tf.Variable(tf.truncated_normal([3, 3, 384, 384], mean=0, stddev=0.01)),
    'conv5': tf.Variable(tf.truncated_normal([3, 3, 384, 256], mean=0, stddev=0.01)),
    'fc1': tf.Variable(tf.truncated_normal([n_fc1, n_fc2], mean=0, stddev=0.01)),
    'fc2': tf.Variable(tf.truncated_normal([n_fc2, n_fc3], mean=0, stddev=0.01)),
    'output': tf.Variable(tf.truncated_normal([n_fc3, n_classes], mean=0, stddev=0.01))
}

# Biases are now drawn from a truncated normal (mean 0.005, stddev 0.1)
# rather than being set to the constant 0.1.
b_conv = {
    'conv1': tf.Variable(tf.truncated_normal([96], mean=0.005, stddev=0.1)),
    'conv2': tf.Variable(tf.truncated_normal([256], mean=0.005, stddev=0.1)),
    'conv3': tf.Variable(tf.truncated_normal([384], mean=0.005, stddev=0.1)),
    'conv4': tf.Variable(tf.truncated_normal([384], mean=0.005, stddev=0.1)),
    'conv5': tf.Variable(tf.truncated_normal([256], mean=0.005, stddev=0.1)),
    'fc1': tf.Variable(tf.truncated_normal([n_fc2], mean=0.005, stddev=0.1)),
    'fc2': tf.Variable(tf.truncated_normal([n_fc3], mean=0.005, stddev=0.1)),
    'output': tf.Variable(tf.truncated_normal([n_classes], mean=0.005, stddev=0.1))
}

训练步骤打印结果如下：
step 1, loss 2299.4792, testing accuracy 0.122
step 2, loss 2299.8113, testing accuracy 0.117
… …
step 11, loss 2293.2812, testing accuracy 0.125
step 12, loss 1955.1497, testing accuracy 0.391
step 13, loss 353.29596, testing accuracy 0.866
step 14, loss 146.65419, testing accuracy 0.954
… …
step 44, loss 36.49175, testing accuracy 0.99
step 45, loss 22.933325, testing accuracy 0.986
step 46, loss 35.3011, testing accuracy 0.99
step 47, loss nan, testing accuracy 0.085
step 48, loss nan, testing accuracy 0.085
step 49, loss nan, testing accuracy 0.085
step 50, loss nan, testing accuracy 0.085
可以看到，此时算法模型可以正常收敛，但在中途会突然梯度爆炸。

实验3：中途调低学习率避免梯度爆炸

为了解决梯度爆炸，我们可以在梯度下降接近低谷附近时调低学习率。这里需要继续在上述代码AlexNet2.py的基础上做些改进，主要是将学习率设为占位符变量，在训练的过程中动态设置学习率。主要改进如下(此部分完整代码详见文献[2]AlexNet3.py)：

learning_rate_holder = tf.placeholder(tf.float32)
train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate_holder).minimize(loss)

... ...

with tf.Session(config=config) as sess:
    sess.run(init)
    total_batch = mnist_data_set.train.num_examples // batch_size
    for i in range(training_epoch):
        for iteration in range(total_batch):
            ... ...

            if i < 30:
                sess.run(train_step, feed_dict={X: batch_xs, y: batch_ys, learning_rate_holder: learning_rate})
            elif i < 50:
                sess.run(train_step, feed_dict={X: batch_xs, y: batch_ys, learning_rate_holder: learning_rate / 10.0})
            elif i < 70:
                sess.run(train_step, feed_dict={X: batch_xs, y: batch_ys, learning_rate_holder: learning_rate / 100.0})
            else:
                sess.run(train_step, feed_dict={X: batch_xs, y: batch_ys, learning_rate_holder: learning_rate / 1000.0})

训练步骤打印结果如下：
step 1, loss 2300.436, testing accuracy 0.117
step 2, loss 2299.375, testing accuracy 0.13
… …
step 10, loss 2290.4155, testing accuracy 0.154
step 11, loss 678.09644, testing accuracy 0.788
step 12, loss 178.35306, testing accuracy 0.948
… …
step 49, loss 35.42096, testing accuracy 0.988
step 50, loss 34.458153, testing accuracy 0.988
梯度爆炸问题得以解决。

实验4：使用tf.get_variable创建变量，使用tf.global_variables_initializer初始化

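上面代码都是使用 tf.Variable(tf.truncated_normal(…)) 来创建权重矩阵的，我们现在尝试使用 tf.get_variable() 来创建权重矩阵。tf.Variable 和 tf.get_variable的区别是前者每次调用都会创建新的对象；而后者会按名字在当前变量作用域(variable scope)中查找变量：在开启了reuse(reuse=True或tf.AUTO_REUSE)的作用域中，如果同名变量已经存在则直接将该变量返回；在默认(未开启reuse)的情况下它会创建一个新的变量，若同名变量已经存在则会抛出ValueError [3]。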

在AlexNet1.py的基础上做如下改进，其他不变：(此部分完整代码参见文献[2]AlexNet4.py)

# No initializer is passed, so tf.get_variable falls back to its default
# (glorot_uniform_initializer when the variable scope does not set one either).
# trainable=True is the default, so the fc/output entries behave like the conv entries.
W_conv = {
    'conv1': tf.get_variable('conv1/weights', shape=[11, 11, input_image_channel, 96]),
    'conv2': tf.get_variable('conv2/weights', shape=[5, 5, 96, 256]),
    'conv3': tf.get_variable('conv3/weights', shape=[3, 3, 256, 384]),
    'conv4': tf.get_variable('conv4/weights', shape=[3, 3, 384, 384]),
    'conv5': tf.get_variable('conv5/weights', shape=[3, 3, 384, 256]),
    'fc1': tf.get_variable('fc1/weights', shape=[n_fc1, n_fc2], trainable=True),
    'fc2': tf.get_variable('fc2/weights', shape=[n_fc2, n_fc3], trainable=True),
    'output': tf.get_variable('output/weights', shape=[n_fc3, n_classes], trainable=True)
}

# The biases are created the same way, so they also receive the default
# initializer rather than a constant.
b_conv = {
    'conv1': tf.get_variable('conv1/biases', shape=[96]),
    'conv2': tf.get_variable('conv2/biases', shape=[256]),
    'conv3': tf.get_variable('conv3/biases', shape=[384]),
    'conv4': tf.get_variable('conv4/biases', shape=[384]),
    'conv5': tf.get_variable('conv5/biases', shape=[256]),
    'fc1': tf.get_variable('fc1/biases', shape=[n_fc2], trainable=True),
    'fc2': tf.get_variable('fc2/biases', shape=[n_fc3], trainable=True),
    'output': tf.get_variable('output/biases', shape=[n_classes], trainable=True)
}

训练步骤打印结果如下：
step 1, loss 170.07048, testing accuracy 0.936
step 2, loss 86.70665, testing accuracy 0.972
… …
step 50, loss 26.004112, testing accuracy 0.99
可以看到，算法模型step1时测试准确率就达到93.6%了，且后面没有梯度爆炸。

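在这里，由于我们没有给tf.get_variable指定initializer(也没有在variable scope中设置默认的initializer)，它会使用默认的glorot_uniform_initializer(即Xavier均匀初始化)，这种初始化会根据每层的输入、输出维度自动缩放取值范围；tf.global_variables_initializer()本身并不决定初始化策略，它只是执行各个变量在创建时指定的初始化操作。可见tf.get_variable默认的初始化策略比实验1中标准差为1的截断正态分布要好得多。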

本实验说明了良好的权重初始化对算法模型的训练是非常重要的。

实验5：对卷积层分组

在AlexNet2.py的基础上进行改进，将 W_conv 中的conv4、conv5分成两组，具体改进如下：（代码详见文献[2]AlexNet5.py）

# conv4 and conv5 kernels are split into two groups each: every group takes half
# of the input channels and produces half of the output channels.
W_conv = {
    ... ...
    'conv4_1': tf.Variable(tf.truncated_normal([3, 3, 384//2, 384//2], mean=0, stddev=0.01)),
    'conv4_2': tf.Variable(tf.truncated_normal([3, 3, 384//2, 384//2], mean=0, stddev=0.01)),
    'conv5_1': tf.Variable(tf.truncated_normal([3, 3, 384//2, 256//2], mean=0, stddev=0.01)),
    'conv5_2': tf.Variable(tf.truncated_normal([3, 3, 384//2, 256//2], mean=0, stddev=0.01)),
    ... ...
}

对卷积层4、卷积层5的具体改进如下：

# Both grouped layers convolve with stride 1 in every dimension.
unit_stride = [1, 1, 1, 1]

# Convolution layer 4: split conv3 into two channel halves, convolve each half
# with its own kernel, then join the halves again along the channel axis.
conv3groups = tf.split(conv3, 2, axis=3)
conv4_1 = tf.nn.conv2d(conv3groups[0], W_conv['conv4_1'], strides=unit_stride, padding='SAME')
conv4_2 = tf.nn.conv2d(conv3groups[1], W_conv['conv4_2'], strides=unit_stride, padding='SAME')
conv4 = tf.nn.relu(
    tf.nn.bias_add(tf.concat([conv4_1, conv4_2], axis=3), b_conv['conv4']))

# Convolution layer 5: same two-group scheme applied to conv4.
conv4groups = tf.split(conv4, 2, axis=3)
conv5_1 = tf.nn.conv2d(conv4groups[0], W_conv['conv5_1'], strides=unit_stride, padding='SAME')
conv5_2 = tf.nn.conv2d(conv4groups[1], W_conv['conv5_2'], strides=unit_stride, padding='SAME')
conv5 = tf.nn.relu(
    tf.nn.bias_add(tf.concat([conv5_1, conv5_2], axis=3), b_conv['conv5']))

其他代码不变，训练测试结果打印如下：
step 1, loss 2297.4663, testing accuracy 0.104
step 2, loss 2299.2734, testing accuracy 0.124
… …
step 3, loss 2114.1978, testing accuracy 0.301
step 4, loss 366.07996, testing accuracy 0.872
… …
step 5, loss 44.795494, testing accuracy 0.992
step 6, loss 37.983482, testing accuracy 0.988
和AlexNet2.py相比，此时算法梯度下降速度更快，且不会出现梯度爆炸。

实验6：使用Batch Normalization优化

在AlexNet2.py的基础上，使用Batch Normalization算法优化模型，并且去掉lrn，主要改进如下：（代码详见文献[2]AlexNet6.py）

def batch_norm(inputs, is_training, is_conv_out=True, decay=0.999, epsilon=0.001):
    """Batch-normalise `inputs` along its last (channel/feature) dimension.

    Args:
        inputs: tensor whose last dimension is the channel/feature dimension.
        is_training: Python bool evaluated while the graph is built. True
            normalises with the statistics of the current batch and updates the
            moving averages; False normalises with the stored moving averages.
        is_conv_out: True takes the moments over axes [0, 1, 2] (convolution
            output), False over axis [0] only (fully connected output).
        decay: weight given to the old value in the moving-average update.
        epsilon: variance epsilon handed to tf.nn.batch_normalization
            (previously hard-coded to 0.001, which remains the default).

    Returns:
        The normalised tensor produced by tf.nn.batch_normalization.
    """
    channels = [inputs.get_shape()[-1]]
    scale = tf.Variable(tf.ones(channels))
    beta = tf.Variable(tf.zeros(channels))
    # Moving averages are updated by the assign ops below, not by the optimizer.
    pop_mean = tf.Variable(tf.zeros(channels), trainable=False)
    pop_var = tf.Variable(tf.ones(channels), trainable=False)

    if not is_training:
        return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, scale, epsilon)

    axes = [0, 1, 2] if is_conv_out else [0]
    batch_mean, batch_var = tf.nn.moments(inputs, axes)

    train_mean = tf.assign(pop_mean, pop_mean*decay+batch_mean*(1-decay))
    train_var = tf.assign(pop_var, pop_var*decay+batch_var*(1-decay))

    # The control dependency makes the moving-average updates run whenever the
    # normalised output is evaluated.
    with tf.control_dependencies([train_mean, train_var]):
        return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, scale, epsilon)


... ...
... ...


# batch_norm is applied directly after the bias add (conv layers) or the
# matmul + add (fully connected layers) of each layer.
# NOTE(review): is_training is the literal True at every call site and batch_norm
# branches on it in Python, so this graph always normalises with batch statistics;
# the moving averages are updated but never used, also when evaluating on test data.
conv1 = tf.nn.bias_add(...)
conv1 = batch_norm(conv1, True)
... ...

conv2 = tf.nn.bias_add(...)
conv2 = batch_norm(conv2, True)
... ...

conv3 = tf.nn.bias_add(...)
conv3 = batch_norm(conv3, True)
... ...

conv4 = tf.nn.bias_add(...)
conv4 = batch_norm(conv4, True)
... ...

conv5 = tf.nn.bias_add(...)
conv5 = batch_norm(conv5, True)
... ...

# Fully connected outputs: is_conv_out=False, so moments are taken over the batch axis only.
fc1 = tf.add(...)
fc1 = batch_norm(fc1, True, False)
... ...

fc2 = tf.add(...)
fc2 = batch_norm(fc2, True, False)
... ...

训练测试结果打印如下：
step 1, loss 44.443394, testing accuracy 0.979
step 1, loss 34.873466, testing accuracy 0.988
… …
step 1, loss 13.521537, testing accuracy 0.992
step 1, loss 10.757448, testing accuracy 0.991
很明显，使用BN算法优化效果非常显著。

在AlexNet6.py中，我们使用tf.truncated_normal并指定mean、stddev来创建参数矩阵。后来我实验发现，如果改用AlexNet1.py中创建参数矩阵的方法(即使用tf.truncated_normal创建参数矩阵，但mean和stddev使用默认的0和1，详见文献[2]AlexNet7.py)，模型仍然无法收敛。这说明即使使用了BN优化，不恰当的参数初始化仍然会使模型无法收敛。

最后附上上述实验的loss和accuracy曲线图：
enter image description here

enter image description here

实验总结

合适的参数初始化是非常重要的；
动态调整学习率；
当我们不确定如何手动初始化参数矩阵时，可以使用 tf.get_variable 默认的初始化策略(glorot_uniform_initializer，再由 tf.global_variables_initializer 执行初始化)；
对卷积层分组是一个很好的优化思路；
Batch Normalization算法优化效果非常显著。