Kevin Xu-TensorFlow Tutorials-cifar10 (2)

本教程参考Kevin的视频教程:Youtube

数据集下载地址为:点击打开链接

Kevin知乎:点击打开链接

在上一部分 https://blog.csdn.net/u014264373/article/details/79960869 已经讲了数据的处理,接下来就是训练以及评估。

1. 这部分代码的使用方法,如果你不想知道怎么写的,可以直接运行。文尾附上全部代码

# 0. you need to change the data directory(目录),文中所有涉及地址的地方你都要改成自己的
# 1. run cifar10-MAIN.py
# 2. call train() in the console to train the model
# 3. call evaluate() in the console to test on the test data

-----------------------------------------------代码分解---------------------------------------------------

1.导入包

import os
import os.path
import math
import numpy as np
import tensorflow as tf
import input_data # 之前写的数据处理

2.设置一些超参数

# Hyperparameters: batch size, learning rate and maximum number of training steps.
BATCH_SIZE = 128
learning_rate = 0.05
MAX_STEP = 100000
# NOTE(review): the original note said "with this setting, it took less than 30 mins on my
# laptop to train", but the author later reports about 2 days for 100k steps -- confirm.
# A larger MAX_STEP trains longer; start with a smaller value such as 10000.

接下来网络中的一些参数初始化,如W,b、包括各层的卷积,激活,池化等,包含整个模型框架。这个框架同CATS VS DOGS框架一样。整个卷积过程就是卷积》池化》归一化》卷积》归一化》池化》全连接》全连接》softmax
对于应该先做归一化(LRN)还是先做池化,我还没有完全领悟,你知道吗?

3.网络架构

def inference(images):
    """Build the CIFAR-10 CNN and return unnormalized class scores (logits).

    Layer order: conv1 -> pool1 -> lrn1 -> conv2 -> lrn2 -> pool2 ->
    local3 (fully connected) -> local4 (fully connected) -> softmax_linear.

    Shape comments below assume 32x32 RGB inputs, as the accompanying
    article states; confirm against what input_data actually produces.

    Args:
        images: 4D float32 tensor [batch_size, height, width, 3]. Height and
            width must be statically known. The batch size is taken from the
            tensor itself rather than from the module-level BATCH_SIZE, so
            any batch size works.

    Returns:
        2D tensor [batch_size, 10] of logits. No softmax is applied here.

    Raises:
        ValueError: if the spatial size of `images` is not statically known,
            because the first fully connected layer needs a fixed input width.

    Notes:
        Each conv kernel has shape
        [kernel_size, kernel_size, input channels, output channels].
    """
    # conv1: 3x3 kernels, 3 input channels, 96 output channels.
    with tf.variable_scope('conv1') as scope:
        weights = tf.get_variable('weights',
                                  shape=[3, 3, 3, 96],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[96],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)  # scope.name == 'conv1'
    # conv1 output: [batch, 32, 32, 96] (stride 1 + SAME keeps the spatial size)

    # pool1 and norm1: 3x3 max pooling with stride 2, then local response normalization.
    with tf.variable_scope('pooling1_lrn') as scope:
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pooling1')
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm1')
    # norm1 output: [batch, 16, 16, 96] (stride 2 halves height and width)

    # conv2: 3x3 kernels, 96 input channels, 64 output channels.
    with tf.variable_scope('conv2') as scope:
        weights = tf.get_variable('weights',
                                  shape=[3, 3, 96, 64],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[64],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name='conv2')
    # conv2 output: [batch, 16, 16, 64]

    # norm2 and pool2: here normalization comes first; pooling uses stride 1.
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1],
                               padding='SAME', name='pooling2')
    # pool2 output: [batch, 16, 16, 64] (stride 1 keeps the spatial size)

    # local3: first fully connected layer.
    with tf.variable_scope('local3') as scope:
        # Flattened feature width = height * width * channels of pool2
        # (16 * 16 * 64 = 16384 for 32x32 inputs).
        dim = pool2.get_shape()[1:].num_elements()
        if dim is None:
            raise ValueError('inference() needs images with a statically '
                             'known height, width and channel count')
        # -1 lets TensorFlow infer the batch dimension instead of relying on
        # the module-level BATCH_SIZE.
        reshape = tf.reshape(pool2, shape=[-1, dim])
        weights = tf.get_variable('weights',
                                  shape=[dim, 384],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[384],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    # local3 output: [batch, 384]

    # local4: second fully connected layer.
    with tf.variable_scope('local4') as scope:
        weights = tf.get_variable('weights',
                                  shape=[384, 192],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[192],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name='local4')
    # local4 output: [batch, 192]

    # Final linear layer producing one score per class (10 classes).
    with tf.variable_scope('softmax_linear') as scope:
        # The variable is named 'softmax_linear' rather than 'weights'; the
        # name is kept unchanged so previously saved checkpoints still restore.
        weights = tf.get_variable('softmax_linear',
                                  shape=[192, 10],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[10],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name='softmax_linear')
    # softmax_linear output: [batch, 10]
    return softmax_linear

4.loss求解=损失函数

def losses(logits, labels):
    """Return the mean softmax cross-entropy between logits and labels.

    The loss op is chosen from the static rank of `labels`, so the same
    function works whether or not the input pipeline one-hot encodes labels:

    * rank 1 (integer class ids)  -> sparse_softmax_cross_entropy_with_logits
    * otherwise (one-hot rows)    -> softmax_cross_entropy_with_logits

    Args:
        logits: 2D tensor [batch_size, num_classes] of unnormalized scores.
        labels: either a 1D tensor [batch_size] of class ids, or a 2D tensor
            [batch_size, num_classes] of one-hot (or soft) labels.

    Returns:
        Scalar tensor: the cross-entropy averaged over the batch. A scalar
        summary named 'loss/loss' is also registered.
    """
    with tf.variable_scope('loss') as scope:
        labels = tf.convert_to_tensor(labels)

        if labels.get_shape().ndims == 1:
            # Class-id labels: the sparse op requires an integer dtype.
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.cast(labels, tf.int64),
                name='xentropy_per_example')
        else:
            # One-hot labels: cast to the logits dtype (not to an integer
            # type) so 0/1 labels are unchanged and soft labels are not
            # truncated to zero.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.cast(labels, logits.dtype),
                name='xentropy_per_example')

        loss = tf.reduce_mean(cross_entropy, name='loss')
        tf.summary.scalar(scope.name + '/loss', loss)

    return loss

这里需要注意的是loss函数的选择,要依据有没有用one-hot选择不同的函数,在代码里有注释,请合理选用。

5.训练,也就是网络的反向传播,选用的是SGD梯度下降,也就是随机梯度下降。

# 开始训练,这里需要填入自己的数据地址
def train():
    """Train the CIFAR-10 model with plain gradient descent.

    Reads training batches through input_data.read_cifar10, minimizes the
    loss for up to MAX_STEP steps, prints the loss every 50 steps, writes
    TensorBoard summaries every 100 steps and saves a checkpoint every 2000
    steps and on the last step.

    The data and log directories are hard-coded below and must be changed to
    match your machine.

    The graph is built inside its own tf.Graph (as evaluate() does), so
    train() can be called more than once in the same Python session.
    """
    data_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
    # Directory for checkpoints and TensorBoard summaries.
    log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'

    with tf.Graph().as_default():
        my_global_step = tf.Variable(0, name='global_step', trainable=False)

        # input_data is the data-batching module from the previous part.
        images, labels = input_data.read_cifar10(data_dir=data_dir,
                                                 is_train=True,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=True)
        # logits are the raw class scores; softmax is applied inside the loss.
        logits = inference(images)
        loss = losses(logits, labels)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=my_global_step)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        checkpoint_path = os.path.join(log_dir, 'model.ckpt')

        # The session is closed by the `with` block even if training fails.
        with tf.Session() as sess:
            sess.run(init)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            try:
                for step in range(MAX_STEP):
                    if coord.should_stop():
                        break
                    _, loss_value = sess.run([train_op, loss])

                    if step % 50 == 0:
                        print('Step: %d, loss: %.4f' % (step, loss_value))

                    if step % 100 == 0:
                        # NOTE(review): this is a separate run, so it
                        # presumably pulls a fresh batch from the input
                        # queue; the logged loss is not the one printed above.
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    if step % 2000 == 0 or (step + 1) == MAX_STEP:
                        saver.save(sess, checkpoint_path, global_step=step)

            except tf.errors.OutOfRangeError:
                print('Done training -- epoch limit reached')
            finally:
                # Always stop the input threads and flush pending summaries,
                # including when an unexpected exception is propagating.
                coord.request_stop()
                summary_writer.close()
                coord.join(threads)

6.评估,用测试集测试训练的代码的正确率。

def evaluate():
    """Restore the latest checkpoint and print top-1 precision on the test set.

    Builds the model in a fresh graph, restores the most recent checkpoint
    found in the log directory and runs ceil(n_test / BATCH_SIZE) test
    batches. Returns without evaluating if no checkpoint is found.

    Labels may be integer class ids or one-hot rows; one-hot labels are
    converted to class ids because tf.nn.in_top_k only accepts int32/int64
    class ids as targets.

    The directories are hard-coded below and must be changed to match your
    machine.
    """
    with tf.Graph().as_default():

        log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'
        # Directory holding the test data.
        test_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
        n_test = 10000

        # Read the test data.
        images, labels = input_data.read_cifar10(data_dir=test_dir,
                                                 is_train=False,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=False)

        logits = inference(images)  # raw class scores, [batch, num_classes]

        labels = tf.convert_to_tensor(labels)
        label_shape = labels.get_shape()
        if (label_shape.ndims == 2
                and label_shape.as_list()[1] == logits.get_shape().as_list()[1]):
            # One-hot labels: reduce each row to its class id.
            labels = tf.argmax(labels, axis=1)
        labels = tf.cast(labels, tf.int32)

        # Boolean vector: True where the top-scoring class equals the label.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session() as sess:
            # Restore the trained model.
            print("Reading checkpoints...")
            ckpt = tf.train.get_checkpoint_state(log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Checkpoint files are named 'model.ckpt-<step>'.
                global_step = os.path.basename(ckpt.model_checkpoint_path).split('-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            else:
                print('No checkpoint file found')
                return

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                # float() keeps the ceiling correct under Python 2 as well.
                num_iter = int(math.ceil(float(n_test) / BATCH_SIZE))
                true_count = 0
                # NOTE(review): this is num_iter full batches, which exceeds
                # n_test when BATCH_SIZE does not divide it; presumably the
                # input queue wraps around for the last batch -- confirm.
                total_sample_count = num_iter * BATCH_SIZE
                step = 0

                while step < num_iter and not coord.should_stop():
                    predictions = sess.run(top_k_op)
                    true_count += np.sum(predictions)
                    step += 1

                # precision = correctly predicted images / images evaluated
                precision = float(true_count) / total_sample_count
                print('precision = %.3f' % precision)
            except Exception as e:
                # Hand the error to the coordinator; coord.join() re-raises it.
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)

完了,最后可以调用train()和evaluate()来训练和评估网络性能。

if __name__=='__main__':
    train()
    # Author's note: 100k iterations took about 2 days; the loss had already reached
    # roughly 0 at around 40k-50k steps, so that many iterations are not needed.
    evaluate()

如果对精度要求不高,建议只迭代10k次;如果计算机性能允许,可以尝试100k次。

附上我的测试结果,训练就不演示了。我的正确率为78.1%


7.错误分析

TypeError: Value passed to parameter 'targets' has DataType float32 not in list of allowed values: int32, int64

如果出现这个错误,是因为代码

top_k_op = tf.nn.in_top_k(logits, labels, 1)
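这句造成的。tf.nn.in_top_k 的 targets 参数只接受 int32/int64 的类别编号,不接受 one-hot 格式(或浮点型)的 labels,所以解决的方法很简单:把数据处理的代码改为不用 one-hot 处理。注意:labels 不用 one-hot 之后,损失函数也必须对应使用 tf.nn.sparse_softmax_cross_entropy_with_logits(见第4节代码中的注释),否则训练时 logits 与 labels 的形状不匹配。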

感兴趣的还可以用tensorboard把我们的模型及结果可视化看看。

附全代码;

'''
这个部分主要是模型的搭建,训练以及评估

# How to run?
# 0. you need to change the data directory(目录)
# 1. run cifar10-main.py
# 2. call train() in the console to train the model
# 3. call evaluate() in the console to test on the test data

'''

# 导入模型
import os
import os.path
import math
import numpy as np
import tensorflow as tf
import input_data # 之前写的数据分批

# Hyperparameters: batch size, learning rate and maximum number of training steps.
BATCH_SIZE = 128
learning_rate = 0.05
MAX_STEP = 100000
# NOTE(review): the original note said "with this setting, it took less than 30 mins on my
# laptop to train", but the author later reports about 2 days for 100k steps -- confirm.
# A larger MAX_STEP trains longer; start with a smaller value such as 10000.
'''
接下来网络中的一些参数初始化,如W,b、包括各层的卷积,激活,池化等,包含整个模型框架
这个框架同CATS VS DOGS框架一样
整个卷积过程就是卷积》池化》归一化》卷积》归一化》池化》全连接》全连接》softmax
对于先是正则化还是池化还没有领悟
'''
def inference(images):
    """Build the CIFAR-10 CNN and return unnormalized class scores (logits).

    Layer order: conv1 -> pool1 -> lrn1 -> conv2 -> lrn2 -> pool2 ->
    local3 (fully connected) -> local4 (fully connected) -> softmax_linear.

    Shape comments below assume 32x32 RGB inputs, as the accompanying
    article states; confirm against what input_data actually produces.

    Args:
        images: 4D float32 tensor [batch_size, height, width, 3]. Height and
            width must be statically known. The batch size is taken from the
            tensor itself rather than from the module-level BATCH_SIZE, so
            any batch size works.

    Returns:
        2D tensor [batch_size, 10] of logits. No softmax is applied here.

    Raises:
        ValueError: if the spatial size of `images` is not statically known,
            because the first fully connected layer needs a fixed input width.

    Notes:
        Each conv kernel has shape
        [kernel_size, kernel_size, input channels, output channels].
    """
    # conv1: 3x3 kernels, 3 input channels, 96 output channels.
    with tf.variable_scope('conv1') as scope:
        weights = tf.get_variable('weights',
                                  shape=[3, 3, 3, 96],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[96],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(images, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)  # scope.name == 'conv1'
    # conv1 output: [batch, 32, 32, 96] (stride 1 + SAME keeps the spatial size)

    # pool1 and norm1: 3x3 max pooling with stride 2, then local response normalization.
    with tf.variable_scope('pooling1_lrn') as scope:
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pooling1')
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm1')
    # norm1 output: [batch, 16, 16, 96] (stride 2 halves height and width)

    # conv2: 3x3 kernels, 96 input channels, 64 output channels.
    with tf.variable_scope('conv2') as scope:
        weights = tf.get_variable('weights',
                                  shape=[3, 3, 96, 64],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.05, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[64],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        conv = tf.nn.conv2d(norm1, weights, strides=[1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name='conv2')
    # conv2 output: [batch, 16, 16, 64]

    # norm2 and pool2: here normalization comes first; pooling uses stride 1.
    with tf.variable_scope('pooling2_lrn') as scope:
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1.0, alpha=0.001 / 9.0,
                          beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1],
                               padding='SAME', name='pooling2')
    # pool2 output: [batch, 16, 16, 64] (stride 1 keeps the spatial size)

    # local3: first fully connected layer.
    with tf.variable_scope('local3') as scope:
        # Flattened feature width = height * width * channels of pool2
        # (16 * 16 * 64 = 16384 for 32x32 inputs).
        dim = pool2.get_shape()[1:].num_elements()
        if dim is None:
            raise ValueError('inference() needs images with a statically '
                             'known height, width and channel count')
        # -1 lets TensorFlow infer the batch dimension instead of relying on
        # the module-level BATCH_SIZE.
        reshape = tf.reshape(pool2, shape=[-1, dim])
        weights = tf.get_variable('weights',
                                  shape=[dim, 384],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[384],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
    # local3 output: [batch, 384]

    # local4: second fully connected layer.
    with tf.variable_scope('local4') as scope:
        weights = tf.get_variable('weights',
                                  shape=[384, 192],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[192],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name='local4')
    # local4 output: [batch, 192]

    # Final linear layer producing one score per class (10 classes).
    with tf.variable_scope('softmax_linear') as scope:
        # The variable is named 'softmax_linear' rather than 'weights'; the
        # name is kept unchanged so previously saved checkpoints still restore.
        weights = tf.get_variable('softmax_linear',
                                  shape=[192, 10],
                                  dtype=tf.float32,
                                  initializer=tf.truncated_normal_initializer(stddev=0.004, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                 shape=[10],
                                 dtype=tf.float32,
                                 initializer=tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name='softmax_linear')
    # softmax_linear output: [batch, 10]
    return softmax_linear

# 计算cost
def losses(logits, labels):
    """Return the mean softmax cross-entropy between logits and labels.

    The loss op is chosen from the static rank of `labels`, so the same
    function works whether or not the input pipeline one-hot encodes labels:

    * rank 1 (integer class ids)  -> sparse_softmax_cross_entropy_with_logits
    * otherwise (one-hot rows)    -> softmax_cross_entropy_with_logits

    Args:
        logits: 2D tensor [batch_size, num_classes] of unnormalized scores.
        labels: either a 1D tensor [batch_size] of class ids, or a 2D tensor
            [batch_size, num_classes] of one-hot (or soft) labels.

    Returns:
        Scalar tensor: the cross-entropy averaged over the batch. A scalar
        summary named 'loss/loss' is also registered.
    """
    with tf.variable_scope('loss') as scope:
        labels = tf.convert_to_tensor(labels)

        if labels.get_shape().ndims == 1:
            # Class-id labels: the sparse op requires an integer dtype.
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.cast(labels, tf.int64),
                name='xentropy_per_example')
        else:
            # One-hot labels: cast to the logits dtype (not to an integer
            # type) so 0/1 labels are unchanged and soft labels are not
            # truncated to zero.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.cast(labels, logits.dtype),
                name='xentropy_per_example')

        loss = tf.reduce_mean(cross_entropy, name='loss')
        tf.summary.scalar(scope.name + '/loss', loss)

    return loss

# 开始训练,这里需要填入自己的数据地址
def train():
    """Train the CIFAR-10 model with plain gradient descent.

    Reads training batches through input_data.read_cifar10, minimizes the
    loss for up to MAX_STEP steps, prints the loss every 50 steps, writes
    TensorBoard summaries every 100 steps and saves a checkpoint every 2000
    steps and on the last step.

    The data and log directories are hard-coded below and must be changed to
    match your machine.

    The graph is built inside its own tf.Graph (as evaluate() does), so
    train() can be called more than once in the same Python session.
    """
    data_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
    # Directory for checkpoints and TensorBoard summaries.
    log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'

    with tf.Graph().as_default():
        my_global_step = tf.Variable(0, name='global_step', trainable=False)

        # input_data is the data-batching module from the previous part.
        images, labels = input_data.read_cifar10(data_dir=data_dir,
                                                 is_train=True,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=True)
        # logits are the raw class scores; softmax is applied inside the loss.
        logits = inference(images)
        loss = losses(logits, labels)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=my_global_step)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        checkpoint_path = os.path.join(log_dir, 'model.ckpt')

        # The session is closed by the `with` block even if training fails.
        with tf.Session() as sess:
            sess.run(init)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            try:
                for step in range(MAX_STEP):
                    if coord.should_stop():
                        break
                    _, loss_value = sess.run([train_op, loss])

                    if step % 50 == 0:
                        print('Step: %d, loss: %.4f' % (step, loss_value))

                    if step % 100 == 0:
                        # NOTE(review): this is a separate run, so it
                        # presumably pulls a fresh batch from the input
                        # queue; the logged loss is not the one printed above.
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    if step % 2000 == 0 or (step + 1) == MAX_STEP:
                        saver.save(sess, checkpoint_path, global_step=step)

            except tf.errors.OutOfRangeError:
                print('Done training -- epoch limit reached')
            finally:
                # Always stop the input threads and flush pending summaries,
                # including when an unexpected exception is propagating.
                coord.request_stop()
                summary_writer.close()
                coord.join(threads)

# 评估,注意改地址

def evaluate():
    """Restore the latest checkpoint and print top-1 precision on the test set.

    Builds the model in a fresh graph, restores the most recent checkpoint
    found in the log directory and runs ceil(n_test / BATCH_SIZE) test
    batches. Returns without evaluating if no checkpoint is found.

    Labels may be integer class ids or one-hot rows; one-hot labels are
    converted to class ids because tf.nn.in_top_k only accepts int32/int64
    class ids as targets.

    The directories are hard-coded below and must be changed to match your
    machine.
    """
    with tf.Graph().as_default():

        log_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/logs/'
        # Directory holding the test data.
        test_dir = 'D:/Python/neural network/CIFAR10-Guoqingxu/data/'
        n_test = 10000

        # Read the test data.
        images, labels = input_data.read_cifar10(data_dir=test_dir,
                                                 is_train=False,
                                                 batch_size=BATCH_SIZE,
                                                 shuffle=False)

        logits = inference(images)  # raw class scores, [batch, num_classes]

        labels = tf.convert_to_tensor(labels)
        label_shape = labels.get_shape()
        if (label_shape.ndims == 2
                and label_shape.as_list()[1] == logits.get_shape().as_list()[1]):
            # One-hot labels: reduce each row to its class id.
            labels = tf.argmax(labels, axis=1)
        labels = tf.cast(labels, tf.int32)

        # Boolean vector: True where the top-scoring class equals the label.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)
        saver = tf.train.Saver(tf.global_variables())

        with tf.Session() as sess:
            # Restore the trained model.
            print("Reading checkpoints...")
            ckpt = tf.train.get_checkpoint_state(log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Checkpoint files are named 'model.ckpt-<step>'.
                global_step = os.path.basename(ckpt.model_checkpoint_path).split('-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)
            else:
                print('No checkpoint file found')
                return

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                # float() keeps the ceiling correct under Python 2 as well.
                num_iter = int(math.ceil(float(n_test) / BATCH_SIZE))
                true_count = 0
                # NOTE(review): this is num_iter full batches, which exceeds
                # n_test when BATCH_SIZE does not divide it; presumably the
                # input queue wraps around for the last batch -- confirm.
                total_sample_count = num_iter * BATCH_SIZE
                step = 0

                while step < num_iter and not coord.should_stop():
                    predictions = sess.run(top_k_op)
                    true_count += np.sum(predictions)
                    step += 1

                # precision = correctly predicted images / images evaluated
                precision = float(true_count) / total_sample_count
                print('precision = %.3f' % precision)
            except Exception as e:
                # Hand the error to the coordinator; coord.join() re-raises it.
                coord.request_stop(e)
            finally:
                coord.request_stop()
                coord.join(threads)

if __name__=='__main__':
    # train()  -- disabled here so that only evaluation runs; uncomment to (re)train.
    # Author's note: 100k iterations took about 2 days; the loss had already reached
    # roughly 0 at around 40k-50k steps, so that many iterations are not needed.
    evaluate()







阅读更多
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页