都说深度学习是炼丹术,调参是炼丹的核心技能。最近基于Imagenet的数据集,测试了一下不同的参数对于性能的影响,在此总结一下。
首先搭建一个深度卷积神经网络,网络结构参照YOLO论文中用于ImageNet预训练的网络,即20个卷积层再加上一个全连接层,具体的网络结构代码如下:
import tensorflow as tf
def _conv(name, inputs, kernel_size, in_channels, out_channels, stride,
          padding, trainable, bias_init, training):
    """Build one convolution unit: conv2d -> bias -> leaky ReLU -> batch norm.

    All variables live under the variable scope ``name`` opened with
    ``tf.AUTO_REUSE``, so calling this again with the same ``name`` reuses
    the existing variables instead of creating new ones.

    Args:
        name: Variable-scope name; also used as the name of the activation op.
        inputs: Input tensor passed to ``tf.nn.conv2d``.
        kernel_size: Side length of the square convolution kernel.
        in_channels: Number of input channels of the kernel.
        out_channels: Number of output channels of the kernel.
        stride: Stride used for both spatial dimensions.
        padding: Padding mode forwarded to ``tf.nn.conv2d``.
        trainable: Whether the kernel, bias and batch-norm variables are
            trainable.
        bias_init: Constant every bias element is initialised to.
        training: Forwarded to ``tf.layers.batch_normalization`` as its
            ``training`` flag.

    Returns:
        The batch-normalised activation tensor.
    """
    filter_shape = [kernel_size, kernel_size, in_channels, out_channels]
    conv_strides = [1, stride, stride, 1]
    weight_init = tf.contrib.layers.variance_scaling_initializer(
        factor=2.0, mode='FAN_IN', uniform=False)

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        weights = tf.get_variable(
            name='weights',
            shape=filter_shape,
            initializer=weight_init,
            trainable=trainable)
        feature_map = tf.nn.conv2d(inputs, weights, conv_strides, padding=padding)

        bias_values = tf.constant(bias_init, shape=[out_channels], dtype=tf.float32)
        biases = tf.get_variable(
            name='biases', initializer=bias_values, trainable=trainable)

        pre_activation = tf.nn.bias_add(feature_map, biases)
        activated = tf.nn.leaky_relu(pre_activation, alpha=0.1, name=name)

        # Batch normalisation is applied after the activation, over axis 3.
        return tf.layers.batch_normalization(
            activated,
            axis=3,
            name='bn',
            trainable=trainable,
            training=training,
            reuse=tf.AUTO_REUSE)
def inference(images, pretrain=True, wd=None, training=True):
    """Build the 20-conv-layer classification network and return its logits.

    The network is a stack of ``_conv`` units (each with 'SAME' padding and a
    bias initial value of 0.01) with 2x2 max pooling after conv1, conv2,
    conv6 and conv16, followed by a global spatial average and one fully
    connected layer with 1000 outputs.

    Args:
        images: Input image batch fed to the first convolution (3 channels).
        pretrain: Used as the ``trainable`` flag of every variable.
        wd: L2 weight-decay coefficient for the fully connected weights.
            When ``None`` (the default) no weight-decay term is created.
        training: Forwarded to every batch-normalisation layer.

    Returns:
        The logits tensor produced by the final fully connected layer.

    Note:
        When ``wd`` is given, each call appends one weight-decay tensor to
        the ``'losses'`` graph collection.
    """
    # (scope name, kernel size, in channels, out channels, stride, pool name)
    # Trailing comments are the output shapes annotated in the original code;
    # they correspond to a 224x224 input.
    layer_specs = [
        ('conv1', 7, 3, 64, 2, 'pool1'),          # 112*112*64 -> pool 56*56*64
        ('conv2', 3, 64, 192, 1, 'pool2'),        # 56*56*192  -> pool 28*28*192
        ('conv3', 1, 192, 128, 1, None),          # 28*28*128
        ('conv4', 3, 128, 256, 1, None),          # 28*28*256
        ('conv5', 1, 256, 256, 1, None),          # 28*28*256
        ('conv6', 3, 256, 512, 1, 'pool6'),       # 28*28*512  -> pool 14*14*512
        ('conv7', 1, 512, 256, 1, None),          # 14*14*256
        ('conv8', 3, 256, 512, 1, None),          # 14*14*512
        ('conv9', 1, 512, 256, 1, None),          # 14*14*256
        ('conv10', 3, 256, 512, 1, None),         # 14*14*512
        ('conv11', 1, 512, 256, 1, None),         # 14*14*256
        ('conv12', 3, 256, 512, 1, None),         # 14*14*512
        ('conv13', 1, 512, 256, 1, None),         # 14*14*256
        ('conv14', 3, 256, 512, 1, None),         # 14*14*512
        ('conv15', 1, 512, 512, 1, None),         # 14*14*512
        ('conv16', 3, 512, 1024, 1, 'pool16'),    # 14*14*1024 -> pool 7*7*1024
        ('conv17', 1, 1024, 512, 1, None),        # 7*7*512
        ('conv18', 3, 512, 1024, 1, None),        # 7*7*1024
        ('conv19', 1, 1024, 512, 1, None),        # 7*7*512
        ('conv20', 3, 512, 1024, 1, None),        # 7*7*1024
    ]

    net = images
    for name, kernel_size, in_channels, out_channels, stride, pool_name in layer_specs:
        net = _conv(name, net, kernel_size, in_channels, out_channels, stride,
                    'SAME', pretrain, 0.01, training)
        if pool_name is not None:
            net = tf.nn.max_pool(net, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                                 padding='VALID', name=pool_name)

    # Global average over the two spatial axes, then drop the singleton axes.
    avg_layer = tf.reduce_mean(net, axis=[1, 2], keepdims=True)  # 1024
    flatten = tf.layers.flatten(inputs=avg_layer, name='flatten')

    with tf.variable_scope('local', reuse=tf.AUTO_REUSE):
        # Written as a float division so the value is 0.001 on Python 2 as
        # well (1/(1000) is integer division there and evaluates to 0).
        weights = tf.get_variable(
            initializer=tf.truncated_normal([1024, 1000], dtype=tf.float32,
                                            stddev=1.0 / 1000),
            trainable=pretrain,
            name='weights')
        # Only build the weight-decay term when a coefficient was supplied;
        # multiplying by the default wd=None would raise.
        if wd is not None:
            weight_decay = tf.multiply(tf.nn.l2_loss(weights), wd, name='weight_loss')
            tf.add_to_collection('losses', weight_decay)
        biases = tf.get_variable(
            initializer=tf.constant(1.0, shape=[1000], dtype=tf.float32),
            trainable=pretrain,
            name='biases')
        local = tf.nn.xw_plus_b(flatten, weights, biases, name='local')
    return local
网络训练的代码如下:
import os
import random
import time

import tensorflow as tf

import yolonet_model
# Size of the crop fed to the network (width, height) and the channel count.
imageWidth, imageHeight, imageDepth = 224, 224, 3
# Number of examples per batch.
batch_size = 112
# The shorter image side is rescaled to this length before cropping.
resize_min = 256
def distort_color(image, color_ordering=0):
    """Randomly perturb brightness, saturation, hue and contrast of an image.

    The four perturbations are always the same; ``color_ordering`` (0-3)
    selects the order in which they are applied. Ordering ``k`` is the base
    sequence brightness, saturation, hue, contrast rotated left by ``k``
    positions. Any other value applies no perturbation.

    Args:
        image: Image tensor to distort.
        color_ordering: Which of the four application orders to use.

    Returns:
        The (possibly distorted) image clipped to the range [0.0, 1.0].
    """
    base_sequence = [
        lambda img: tf.image.random_brightness(img, max_delta=32. / 255.),
        lambda img: tf.image.random_saturation(img, lower=0.5, upper=1.5),
        lambda img: tf.image.random_hue(img, max_delta=0.2),
        lambda img: tf.image.random_contrast(img, lower=0.5, upper=1.5),
    ]

    for shift in range(len(base_sequence)):
        if color_ordering == shift:
            for adjust in base_sequence[shift:] + base_sequence[:shift]:
                image = adjust(image)

    return tf.clip_by_value(image, 0.0, 1.0)
def _parse_function(example_proto):
    """Parse one serialized training example and apply training augmentation.

    The JPEG is decoded to 3 channels and converted to float32, rescaled so
    that its shorter side equals ``resize_min``, randomly cropped to
    ``imageHeight`` x ``imageWidth``, randomly flipped left/right and
    finally standardized per image.

    Args:
        example_proto: A serialized example as read from the TFRecord file.

    Returns:
        A ``(image, label)`` pair: the augmented image and the first element
        of the example's ``label`` feature.
    """
    def string_feature():
        return tf.FixedLenFeature([], tf.string, default_value="")

    def int_feature(default):
        return tf.FixedLenFeature([1], tf.int64, default_value=[default])

    feature_spec = {
        "image": string_feature(),
        "height": int_feature(0),
        "width": int_feature(0),
        "channels": int_feature(3),
        "colorspace": string_feature(),
        "img_format": string_feature(),
        "label": int_feature(0),
        "bbox_xmin": tf.VarLenFeature(tf.float32),
        "bbox_xmax": tf.VarLenFeature(tf.float32),
        "bbox_ymin": tf.VarLenFeature(tf.float32),
        "bbox_ymax": tf.VarLenFeature(tf.float32),
        "text": string_feature(),
        "filename": string_feature(),
    }
    example = tf.parse_single_example(example_proto, feature_spec)

    pixels = tf.image.decode_jpeg(example["image"], channels=3)
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)

    dims = tf.shape(pixels)
    src_height = dims[0]
    src_width = dims[1]

    def scale_long_side(long_side, short_side):
        # Scale the longer side by the same factor that maps the shorter
        # side onto resize_min, truncating to an integer pixel count.
        as_float = tf.cast(long_side, tf.float64)
        factor = tf.divide(resize_min, short_side)
        return tf.cast(tf.multiply(as_float, factor), tf.int32)

    def height_is_shorter():
        return (resize_min, scale_long_side(src_width, src_height))

    def width_is_shorter_or_equal():
        return (scale_long_side(src_height, src_width), resize_min)

    target_height, target_width = tf.cond(
        src_height < src_width, height_is_shorter, width_is_shorter_or_equal)

    rescaled = tf.image.resize_images(pixels, [target_height, target_width])
    patch = tf.random_crop(rescaled, [imageHeight, imageWidth, 3])
    # A random horizontal flip adds a little more variation.
    mirrored = tf.image.random_flip_left_right(patch)
    # distort_color is not applied here; only standardization follows.
    standardized = tf.image.per_image_standardization(mirrored)
    return standardized, example["label"][0]
# Build the training input pipeline on the CPU: read every TFRecord file in
# the training directory, decode/augment in parallel, repeat for 10 passes
# over the data and group examples into batches.
with tf.device('/cpu:0'):
    # Use one directory for both listing and opening the files, so the
    # script no longer depends on the current working directory.
    train_dir = '/home/roy/AI/train_tf/'
    train_files_names = os.listdir(train_dir)
    train_files = [os.path.join(train_dir, item) for item in train_files_names]
    # NOTE(review): no shuffle step is applied; confirm the records are
    # already shuffled on disk.
    dataset_train = tf.data.TFRecordDataset(train_files)
    dataset_train = dataset_train.map(_parse_function, num_parallel_calls=4)
    dataset_train = dataset_train.repeat(10)
    dataset_train = dataset_train.batch(batch_size)
    # prefetch() counts dataset elements, which are whole batches at this
    # point; a small buffer is enough to overlap input with training and
    # avoids holding batch_size batches in memory.
    dataset_train = dataset_train.prefetch(2)
    iterator = tf.data.Iterator.from_structure(dataset_train.output_types, dataset_train.output_shapes)
    next_images, next_labels = iterator.get_next()
    train_init_op = iterator.make_initializer(dataset_train)
def _parse_test_function(example_proto):
    """Parse one serialized validation example without random augmentation.

    The JPEG is decoded to 3 channels and converted to float32, rescaled so
    that its shorter side equals ``resize_min``, center-cropped to
    ``imageHeight`` x ``imageWidth`` and standardized per image.

    Args:
        example_proto: A serialized example as read from the TFRecord file.

    Returns:
        A ``(image, label)`` pair: the cropped, standardized image and the
        first element of the example's ``label`` feature.
    """
    features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.VarLenFeature(tf.float32),
                "bbox_xmax": tf.VarLenFeature(tf.float32),
                "bbox_ymin": tf.VarLenFeature(tf.float32),
                "bbox_ymax": tf.VarLenFeature(tf.float32),
                "text": tf.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.FixedLenFeature([], tf.string, default_value="")
                }
    parsed_features = tf.parse_single_example(example_proto, features)
    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    # A single dtype conversion is enough; the original converted a second
    # time into a variable that was never used.
    image_decoded = tf.image.convert_image_dtype(image_decoded, tf.float32)

    # Rescale so the shorter side becomes resize_min, keeping aspect ratio
    # (the longer side is truncated to an integer pixel count).
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(
        height < width,
        lambda: (resize_min, tf.cast(tf.multiply(tf.cast(width, tf.float64), tf.divide(resize_min, height)), tf.int32)),
        lambda: (tf.cast(tf.multiply(tf.cast(height, tf.float64), tf.divide(resize_min, width)), tf.int32), resize_min))
    image_resized = tf.image.resize_images(image_decoded, [resized_height, resized_width])

    # Center crop: split the surplus rows/columns evenly between both sides.
    shape = tf.shape(image_resized)
    height, width = shape[0], shape[1]
    crop_top = (height - imageHeight) // 2
    crop_left = (width - imageWidth) // 2
    image_cropped = tf.slice(image_resized, [crop_top, crop_left, 0], [imageHeight, imageWidth, -1])
    image_cropped = tf.image.per_image_standardization(image_cropped)
    return image_cropped, parsed_features["label"][0]
# Build the validation input pipeline on the CPU: read every TFRecord file
# in the validation directory, decode in parallel and batch. There is no
# repeat(), so the iterator is exhausted after one pass.
with tf.device('/cpu:0'):
    # Use one directory for both listing and opening the files, so the
    # script no longer depends on the current working directory.
    valid_dir = '/home/roy/AI/valid_tf/'
    valid_files_names = os.listdir(valid_dir)
    valid_files = [os.path.join(valid_dir, item) for item in valid_files_names]
    dataset_valid = tf.data.TFRecordDataset(valid_files)
    dataset_valid = dataset_valid.map(_parse_test_function, num_parallel_calls=4)
    dataset_valid = dataset_valid.batch(batch_size)
    # prefetch() counts dataset elements, which are whole batches at this
    # point; a small buffer avoids holding batch_size batches in memory.
    dataset_valid = dataset_valid.prefetch(2)
    iterator_valid = tf.data.Iterator.from_structure(dataset_valid.output_types, dataset_valid.output_shapes)
    next_valid_images, next_valid_labels = iterator_valid.get_next()
    valid_init_op = iterator_valid.make_initializer(dataset_valid)
# ---- Learning-rate schedule -------------------------------------------------
global_step = tf.Variable(0, trainable=False)
epoch_steps = int(1281167/batch_size)
# Alternative, epoch-based schedule:
#boundaries = [epoch_steps*7, epoch_steps*11]
boundaries = [60000, 80000]
values = [0.01, 0.001, 0.0001]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
lr_summary = tf.summary.scalar('learning_rate', learning_rate)

# ---- Training graph ---------------------------------------------------------
result = yolonet_model.inference(next_images, pretrain=True, wd=0.0005, training=True)
output_result_scores = tf.nn.softmax(result)
output_result = tf.argmax(output_result_scores, 1)

# Cross-entropy loss. tf.losses.sparse_softmax_cross_entropy is deliberately
# not used: it registers its result in tf.GraphKeys.LOSSES, which is the same
# 'losses' collection summed below, so adding the loss by hand as well counted
# the cross-entropy twice in the total loss.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=next_labels, logits=result)
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)

# Total loss = cross-entropy + the L2 weight-decay term added by the model.
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
loss_summary = tf.summary.scalar('loss', loss)

# Momentum optimizer; the UPDATE_OPS collection is run before every step.
optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)

# ---- Validation graph (shares variables, batch norm in inference mode) -----
result_valid = yolonet_model.inference(next_valid_images, pretrain=True, wd=0.0005, training=False)
output_valid_scores = tf.nn.softmax(result_valid)
output_valid_result = tf.argmax(output_valid_scores, 1)
accuracy_valid_batch = tf.reduce_mean(tf.cast(tf.equal(next_valid_labels, output_valid_result), tf.float32))
accuracy_valid_top_5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(output_valid_scores, next_valid_labels, k=5), tf.float32))
acc_1_summary = tf.summary.scalar('accuracy_valid_top_1', accuracy_valid_batch)
acc_2_summary = tf.summary.scalar('accuracy_valid_top_5', accuracy_valid_top_5)

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

with tf.Session() as sess:
    # To resume from a checkpoint, restore it here instead of initializing:
    #saver.restore(sess, "model_bn_loss1/model.ckpt-15000")
    sess.run(tf.global_variables_initializer())
    sess.run([train_init_op, valid_init_op])

    total_loss = 0.0
    starttime = time.time()
    while True:
        # Only the training step can signal the end of the training data,
        # so it is the only statement guarded by this handler.
        try:
            loss_t, lr, step, _ = sess.run([loss, learning_rate, global_step, optimize_op])
        except tf.errors.OutOfRangeError:
            break
        total_loss += loss_t

        # Every 100 steps report the mean loss and the elapsed wall time.
        if step%100==0:
            print("step: %i, Learning_rate: %f, Time: %is Loss: %f"%(step, lr, int(time.time()-starttime), total_loss/100))
            total_loss = 0.0
            starttime = time.time()

        # Every 5000 steps save a checkpoint and run one full validation pass.
        if step%5000==0:
            saver.save(sess, "model_bn_loss1/model.ckpt", global_step=global_step)
            truepredict = 0.0
            truepredict_top5 = 0.0
            valid_count = 0
            while True:
                try:
                    acc_valid_1, acc_valid_5 = sess.run([accuracy_valid_batch, accuracy_valid_top_5])
                except tf.errors.OutOfRangeError:
                    break
                truepredict += acc_valid_1
                truepredict_top5 += acc_valid_5
                valid_count += 1

            # Accuracies are averaged over batches; guard against an empty
            # validation set instead of dividing by zero.
            if valid_count:
                print("valid accuracy of top 1: %f" % (truepredict/valid_count))
                print("valid accuracy of top 5: %f" % (truepredict_top5/valid_count))
            else:
                print("validation dataset produced no batches")

            # Do not count validation time in the next timing report, and
            # rewind the validation iterator for the next pass.
            starttime = time.time()
            sess.run([valid_init_op])
测试结论如下:
1. 在每个卷积层的输出之后增加Batch Normalization,可以加快网络收敛,提高网络性能。Batch normalization应该增加在激活函数之后。Batch normalization增加之后,在调用Optimizer之前,需要确保Batch normalization的平均值和方差都已进行Update。需要增加以下代码:
# Collect the pending update ops and make the optimizer step depend on them,
# so the batch-normalization mean/variance updates run before each step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)
另外,在训练时,tf.layers.batch_normalization的training参数要设置为True,在预测时要设置为False
2. 对于图像的预处理,测试了几种不同的方式:
a. 把图像的像素值转换为0-1
b. 对每张图像的像素值做标准化,使其均值为0、方差为1(即tf.image.per_image_standardization)
c. 随机改变图像的对比度,饱和度,亮度,色相
经过测试,把图像像素值标准化为均值为0、方差为1(方式b)的效果最好
3. 对L2的weight decay的参数的测试,测试了0, 0.005, 0.0005, 0.00005这几个取值,发现0.00005这个取值效果最好
最终的训练结果为,在训练10个EPOCH后,TOP 5的准确率为83.3%, TOP 1的准确率为61.5%