接着上篇博客:“Learning to Compose with Professional Photographs on the Web” 论文解读(一)(附代码与详细注释)进行讲解。上篇博客讲到了论文中数据的处理,该博客将介绍模型的构建、训练以及评估等部分内容,并附上详细的代码及注解。
上图是本文的模型架构,非常简单,输入为两个叠加的总共为6个通道的图像(原图与裁剪图),网络中的特征提取层即为 AlexNet 的前五层卷积网络,随后接一个可选的 SPP 层,两个全连接层,损失函数为 SVM hinge 损失,最后输出一个美学得分。详细代码如下:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
def build_loss_matrix(batch_size):
    """Build the pairwise-difference matrix used by the ranking losses.

    Returns a float32 array of shape (batch_size, 2 * batch_size). Row k holds
    +1 in column k and -1 in column k + batch_size, so multiplying it with a
    column of 2 * batch_size scores gives score[k] - score[k + batch_size].
    """
    matrix = np.zeros((batch_size, 2 * batch_size), dtype=np.float32)
    rows = np.arange(batch_size)
    matrix[rows, rows] = 1
    matrix[rows, rows + batch_size] = -1
    return matrix
def score(feature_vec):
    """Project each feature vector onto a single scalar score.

    Fetches the variable "W" of shape [feature_dim, 1] through
    tf.get_variable (uniform unit-scaling initializer) and returns
    feature_vec multiplied by it.
    """
    feature_dim = feature_vec.get_shape()[1]
    weights = tf.get_variable(
        "W",
        shape=[feature_dim, 1],
        initializer=tf.uniform_unit_scaling_initializer())
    return tf.matmul(feature_vec, weights)
def svm_loss(feature_vec, loss_matrix):
    """Hinge (SVM) ranking loss.

    Scores the features, forms the differences ``loss_matrix x scores`` and
    averages ``max(0, 1 + difference)`` over all rows.

    Returns:
        (loss, differences): the scalar mean hinge loss and the pre-hinge
        difference tensor.
    """
    scores = score(feature_vec)
    differences = tf.matmul(loss_matrix, scores)
    floor = tf.constant(0.0, shape=[1], dtype=tf.float32)
    # Hinge with margin 1, following the formula in the paper.
    hinge = tf.maximum(floor, 1 + differences)
    return tf.reduce_mean(hinge), differences
def ranknet_loss(feature_vec, loss_matrix):
    """RankNet ranking loss (sigmoid cross-entropy against all-zero labels).

    Scores the features, forms the differences ``loss_matrix x scores`` and
    uses them as logits of a sigmoid cross-entropy whose labels are all zero.

    Returns:
        (loss, differences): the scalar mean loss and the logit tensor.
    """
    q = score(feature_vec)
    p = tf.matmul(loss_matrix, q)
    # TensorFlow >= 1.0 rejects positional arguments here; the pre-1.0
    # positional order was (logits, targets), hence logits=p and zero labels.
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(p), logits=p, name='RankNetLoss')
    L = tf.reduce_mean(cross_entropy)
    return L, p
def loss(feature_vec, loss_matrix, ranking_loss_type):
    """Dispatch to the ranking loss selected by ranking_loss_type.

    Args:
        feature_vec: feature tensor forwarded to the selected loss.
        loss_matrix: pairwise-difference matrix forwarded to the selected loss.
        ranking_loss_type: 'svm' or 'ranknet'.

    Returns:
        The (loss, differences) pair produced by svm_loss or ranknet_loss.

    Raises:
        ValueError: if ranking_loss_type is neither 'svm' nor 'ranknet'.
    """
    if ranking_loss_type == 'svm':
        return svm_loss(feature_vec, loss_matrix)
    if ranking_loss_type == 'ranknet':
        return ranknet_loss(feature_vec, loss_matrix)
    # Fail loudly: returning None here would only surface later as an
    # unrelated unpacking error in the caller.
    raise ValueError("Error: ranking loss >> {} << is unknown".format(ranking_loss_type))
def conv(input, kernel, biases, k_h, k_w, c_o, s_h, s_w, padding="VALID", group=1):
    '''Grouped 2-D convolution plus bias.

    From https://github.com/ethereon/caffe-tensorflow

    input:   image tensor shaped [batch, in_height, in_width, in_channels]
    kernel:  filter tensor shaped [filter_height, filter_width, in_channels, out_channels]
    biases:  bias vector added to the convolution output
    k_h:     kernel height (not read by the body)
    k_w:     kernel width (not read by the body)
    c_o:     number of filters; must be divisible by group
    s_h:     stride along the height axis
    s_w:     stride along the width axis
    padding: padding mode forwarded to tf.nn.conv2d
    group:   number of groups; input and kernel are each split into this many
             parts along axis 3 and convolved pairwise
    '''
    in_channels = input.get_shape()[-1]
    assert in_channels % group == 0
    assert c_o % group == 0
    strides = [1, s_h, s_w, 1]

    def _convolve(tensor, filters):
        return tf.nn.conv2d(tensor, filters, strides, padding=padding)

    if group == 1:
        output = _convolve(input, kernel)
    else:
        input_parts = tf.split(input, group, 3)
        kernel_parts = tf.split(kernel, group, 3)
        partial_outputs = [_convolve(t, f) for t, f in zip(input_parts, kernel_parts)]
        output = tf.concat(partial_outputs, 3)
    # Add the bias, then reshape to [-1] + the static non-batch dims so the
    # leading (batch) dimension is inferred automatically.
    biased = tf.nn.bias_add(output, biases)
    return tf.reshape(biased, [-1] + output.get_shape().as_list()[1:])
def get_variable_dict(net_data):
    """Wrap the conv1..conv5 parameters of net_data in TensorFlow variables.

    For each layer N in 1..5, net_data["convN"][0] is stored under "cNw" and
    net_data["convN"][1] under "cNb".
    """
    variables_dict = {}
    # Create weight then bias per layer, in layer order.
    for layer in range(1, 6):
        params = net_data["conv{}".format(layer)]
        variables_dict["c{}w".format(layer)] = tf.Variable(params[0])
        variables_dict["c{}b".format(layer)] = tf.Variable(params[1])
    return variables_dict
def build_alexconvnet(images, variable_dict, embedding_dim, SPP = False, pooling = 'max'):
"""
Build the feature extractor: conv1..conv5 (with LRN and max-pooling after conv1 and conv2), a final pooling stage, and one fully connected ReLU layer (fc6).
images: input image tensor shaped [batch, in_height, in_width, in_channels]
variable_dict: dictionary holding the conv weights/biases under the keys "c1w", "c1b", ..., "c5w", "c5b"
embedding_dim: number of output units of the fc6 layer
SPP: when true, conv5 is pooled with three window/stride settings (3x3/2, 7x7/6, 5x5/4) and the flattened results are concatenated
pooling: 'max' selects tf.nn.max_pool for the final pooling stage; any other value selects tf.nn.avg_pool
Returns the fc6 activations.
"""
# NOTE(review): indentation was lost in this listing; block nesting below must be confirmed against the original source.
#conv1
#conv(11, 11, 96, 4, 4, padding='VALID', name='conv1')
# NOTE: the call below passes padding="SAME", unlike the spec line quoted above.
k_h = 11; k_w = 11; c_o = 96; s_h = 4; s_w = 4
conv1W = variable_dict["c1w"]
conv1b = variable_dict["c1b"]
conv1_in = conv(images, conv1W, conv1b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=1)
conv1 = tf.nn.relu(conv1_in)
#lrn1
#lrn(2, 2e-05, 0.75, name='norm1')
radius = 2; alpha = 2e-05; beta = 0.75; bias = 1.0
lrn1 = tf.nn.local_response_normalization(conv1,
depth_radius=radius,
alpha=alpha,
beta=beta,
bias=bias)
#maxpool1
#max_pool(3, 3, 2, 2, padding='VALID', name='pool1')
k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
maxpool1 = tf.nn.max_pool(lrn1, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)
#conv2
#conv(5, 5, 256, 1, 1, group=2, name='conv2')
k_h = 5; k_w = 5; c_o = 256; s_h = 1; s_w = 1; group = 2
conv2W = variable_dict["c2w"]
conv2b = variable_dict["c2b"]
conv2_in = conv(maxpool1, conv2W, conv2b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
conv2 = tf.nn.relu(conv2_in)
#lrn2
#lrn(2, 2e-05, 0.75, name='norm2')
radius = 2; alpha = 2e-05; beta = 0.75; bias = 1.0
lrn2 = tf.nn.local_response_normalization(conv2,
depth_radius=radius,
alpha=alpha,
beta=beta,
bias=bias)
#maxpool2
#max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
maxpool2 = tf.nn.max_pool(lrn2, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)
#conv3
#conv(3, 3, 384, 1, 1, name='conv3')
k_h = 3; k_w = 3; c_o = 384; s_h = 1; s_w = 1; group = 1
conv3W = variable_dict["c3w"]
conv3b = variable_dict["c3b"]
conv3_in = conv(maxpool2, conv3W, conv3b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
conv3 = tf.nn.relu(conv3_in)
#conv4
#conv(3, 3, 384, 1, 1, group=2, name='conv4')
k_h = 3; k_w = 3; c_o = 384; s_h = 1; s_w = 1; group = 2
conv4W = variable_dict["c4w"]
conv4b = variable_dict["c4b"]
conv4_in = conv(conv3, conv4W, conv4b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
conv4 = tf.nn.relu(conv4_in)
#conv5
#conv(3, 3, 256, 1, 1, group=2, name='conv5')
k_h = 3; k_w = 3; c_o = 256; s_h = 1; s_w = 1; group = 2
conv5W = variable_dict["c5w"]
conv5b = variable_dict["c5b"]
conv5_in = conv(conv4, conv5W, conv5b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
conv5 = tf.nn.relu(conv5_in)
#maxpool5
#max_pool(3, 3, 2, 2, padding='VALID', name='pool5')
# NOTE(review): how far this variable scope extends decides the full names of fc6w/fc6b below (and thus checkpoint keys) - confirm.
with tf.variable_scope("conv5"):
k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
# Choose the pooling op used for the final stage.
if pooling == 'max':
pooling_func = tf.nn.max_pool
else:
pooling_func = tf.nn.avg_pool
if SPP:
# Pool conv5 with three window/stride settings; the names maxpool1/maxpool2 are reused from the earlier stages.
maxpool3 = pooling_func(conv5, ksize=[1, 5, 5, 1], strides=[1, 4, 4, 1], padding=padding)
maxpool2 = pooling_func(conv5, ksize=[1, 7, 7, 1], strides=[1, 6, 6, 1], padding=padding)
maxpool1 = pooling_func(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding=padding)
concat5 = tf.concat([tf.contrib.layers.flatten(maxpool1), tf.contrib.layers.flatten(maxpool2), tf.contrib.layers.flatten(maxpool3)], 1)
bn5 = concat5
else:
maxpool5 = pooling_func(conv5, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)
bn5 = tf.contrib.layers.flatten(maxpool5)
# Static size of the flattened feature vector; used as the fc6 input width.
flattened_dim = int(np.prod(bn5.get_shape()[1:]))
fc6W = tf.get_variable("fc6w", [flattened_dim, embedding_dim], initializer = tf.uniform_unit_scaling_initializer()) # init_weight((flattened_dim, embedding_dim))
fc6b = tf.get_variable("fc6b", [embedding_dim], initializer = tf.constant_initializer()) #init_bias([embedding_dim])
fc6 = tf.nn.relu_layer(bn5, fc6W, fc6b) # fc6 = relu(bn5 * fc6w + fc6b)
return fc6
源码托管至:network.py。
数据增强、tfRecord 解码、训练流程的搭建以及具体训练细节等在此不作详述,贴出代码一目了然:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
import time
import imp
import network as nw
# tabulate is an optional dependency used only to pretty-print tables.
# Importing it directly (instead of probing with the deprecated `imp` module)
# detects availability and binds the name in one step.
try:
    from tabulate import tabulate  # table pretty-printer
    tabulate_available = True
except ImportError:
    tabulate_available = False
import argparse
def read_and_decode(filename_queue):
    """Read one example from the queue and return its two 3-channel halves.

    The 'image_raw' bytes are decoded as uint8, reshaped to [227, 227, 6],
    rescaled to floats in [-0.5, 0.5] and split in two along the channel axis.
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    # No defaults are given: the key is required.
    feature_spec = {'image_raw': tf.FixedLenFeature([], tf.string)}
    parsed = tf.parse_single_example(serialized_example, features=feature_spec)
    raw_pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    stacked = tf.reshape(raw_pixels, [227, 227, 6])
    # Convert from [0, 255] -> [-0.5, 0.5] floats.
    scaled = tf.cast(stacked, tf.float32) * (1. / 255) - 0.5
    # Split axis 2 (channels) into two equal parts.
    return tf.split(scaled, 2, 2)
def read_and_decode_aug(filename_queue):
    """Read one example, apply random augmentation, return its two halves.

    Same decoding as read_and_decode, plus a random left-right flip on the
    6-channel stack (so both halves flip together) and small random
    brightness/contrast changes after rescaling to [-0.5, 0.5].
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    # No defaults are given: the key is required.
    feature_spec = {'image_raw': tf.FixedLenFeature([], tf.string)}
    parsed = tf.parse_single_example(serialized_example, features=feature_spec)
    raw_pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    stacked = tf.reshape(raw_pixels, [227, 227, 6])
    flipped = tf.image.random_flip_left_right(stacked)
    # Convert from [0, 255] -> [-0.5, 0.5] floats.
    scaled = tf.cast(flipped, tf.float32) * (1. / 255) - 0.5
    jittered = tf.image.random_brightness(scaled, 0.01)
    jittered = tf.image.random_contrast(jittered, 0.95, 1.05)
    # Split axis 2 (channels) back into the two 3-channel images.
    return tf.split(jittered, 2, 2)
def inputs(filename, batch_size, num_epochs=None, shuffle=False, aug=False):
    """Build the input pipeline that reads image pairs from a TFRecord file.

    filename:   path of the TFRecord file
    batch_size: number of pairs per batch
    num_epochs: forwarded to tf.train.string_input_producer
    shuffle:    batch with tf.train.shuffle_batch instead of tf.train.batch
    aug:        decode with read_and_decode_aug (random augmentation)

    Returns one tensor: the batch of first halves followed by the batch of
    second halves, concatenated along axis 0 (2 * batch_size rows).
    """
    with tf.name_scope('input'):
        # The file name is pushed through a queue so that reader threads
        # share a single filename queue.
        filename_queue = tf.train.string_input_producer(
            [filename], num_epochs=num_epochs)
        decode = read_and_decode_aug if aug else read_and_decode
        crop, full = decode(filename_queue)
        if shuffle:
            crops, fulls = tf.train.shuffle_batch(
                [crop, full],
                batch_size=batch_size,
                num_threads=4,
                capacity=2000 + 4 * batch_size,
                enqueue_many=False,
                # Ensures a minimum amount of shuffling of examples.
                min_after_dequeue=1000)
        else:
            crops, fulls = tf.train.batch(
                [crop, full],
                batch_size=batch_size,
                num_threads=1,
                capacity=100 + 3 * batch_size,
                allow_smaller_final_batch=False)
        # Stack along the batch axis: rows [0, batch_size) are crops and
        # rows [batch_size, 2 * batch_size) are fulls.
        return tf.concat([crops, fulls], 0)
# Helper Functions
def count_tfrecords(path):
    """Return the number of records in the TFRecord file at path."""
    return sum(1 for _ in tf.python_io.tf_record_iterator(path))
def str2bool(v):
    """Parse a command-line boolean (argparse ``type=`` callable).

    Case-insensitively maps yes/true/t/y/1 to True and no/false/f/n/0 to
    False; anything else raises argparse.ArgumentTypeError.
    """
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')
    normalized = v.lower()
    if normalized in truthy:
        return True
    if normalized in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
if __name__ == '__main__':
    # Training entry point: parse options, build the training and validation
    # graphs (sharing variables), then run the optimisation loop with periodic
    # checkpoints and validation.
    import os  # local import: only needed to create the snapshot directory

    parser = argparse.ArgumentParser()
    parser.add_argument("--embedding_dim", help="Embedding dimension before mapping to one-dimensional score", type=int, default = 1000)
    parser.add_argument("--validation_interval", help="Number of iterations after which validation is run", type=int, default = 500)
    parser.add_argument("--batch_train", help="Batch size for training", type=int, default=100)
    parser.add_argument("--batch_val", help="Batch size for validation", type=int, default=14)
    parser.add_argument("--checkpoint_interval", help="Number of iterations after which a checkpoint file is written", type=int, default=1000)
    parser.add_argument("--total_steps", help="Number of total training iterations", type=int, default=15000)
    parser.add_argument("--initial_lr", help="Initial learning rate", type=float, default=0.01)
    parser.add_argument("--momentum", help="Momentum coefficient", type=float, default=0.9)
    parser.add_argument("--step_size", help="Number of steps after which the learning rate is reduced", type=int, default=10000)
    parser.add_argument("--step_factor", help="Reduction factor for the learning rate", type=float, default=0.2)
    parser.add_argument("--initial_parameters", help="Path to initial parameter file", type=str, default="alexnet.npy")
    parser.add_argument("--ranking_loss", help="Type of ranking loss", type=str, choices=['ranknet', 'svm'], default='svm')
    parser.add_argument("--checkpoint_name", help="Name of the checkpoint files", type=str, default='view_finding_network')
    parser.add_argument("--spp", help="Whether to use spatial pyramid pooling in the last layer or not", type=str2bool, default=True)
    parser.add_argument("--pooling", help="Which pooling function to use", type=str, choices=['max', 'avg'], default='max')
    parser.add_argument("--augment", help="Whether to augment training data or not", type=str2bool, default=True)
    parser.add_argument("--training_db", help="Path to training database", type=str, default='trn.tfrecords')
    parser.add_argument("--validation_db", help="Path to validation database", type=str, default='val.tfrecords')
    args = parser.parse_args()

    embedding_dim = args.embedding_dim
    validation_interval = args.validation_interval
    batch_size_trn = args.batch_train
    batch_size_val = args.batch_val
    checkpoint_interval = args.checkpoint_interval
    total_steps = args.total_steps
    validation_instances = count_tfrecords(args.validation_db)  # number of validation records
    initial_lr = args.initial_lr
    momentum_coeff = args.momentum
    step_size = args.step_size
    step_factor = args.step_factor
    parameter_path = args.initial_parameters  # initial AlexNet weights
    ranking_loss = args.ranking_loss
    experiment_name = args.ranking_loss
    spp = args.spp
    augment_training_data = args.augment

    parameter_table = [["Initial parameters", parameter_path],
                       ["Ranking loss", ranking_loss], ["SPP", spp], ["Pooling", args.pooling],
                       ['Experiment', experiment_name],
                       ['Embedding dim', embedding_dim], ['Batch size', batch_size_trn],
                       ['Initial LR', initial_lr], ['Momentum', momentum_coeff],
                       ['LR Step size', step_size], ['LR Step factor', step_factor],
                       ['Total Steps', total_steps]]

    # Input pipelines: shuffled (optionally augmented) training data, ordered validation data.
    training_images = inputs(args.training_db, batch_size_trn, None, True, augment_training_data)
    test_images = inputs(args.validation_db, batch_size_val, None, False)

    # NOTE(review): .item() on the loaded array yields a pickled Python object
    # (a dict of layer parameters); only load parameter files from trusted sources.
    net_data = np.load(parameter_path).item()
    var_dict = nw.get_variable_dict(net_data)

    # Build the training graph, then the validation graph on the same variables.
    with tf.variable_scope("ranker") as scope:
        feature_vec = nw.build_alexconvnet(training_images, var_dict, embedding_dim, spp, args.pooling)
        L, p = nw.loss(feature_vec, nw.build_loss_matrix(batch_size_trn), ranking_loss)
        scope.reuse_variables()  # share variables with the validation graph
        val_feature_vec = nw.build_alexconvnet(test_images, var_dict, embedding_dim, spp, args.pooling)
        L_val, p_val = nw.loss(val_feature_vec, nw.build_loss_matrix(batch_size_val), ranking_loss)

    # NOTE(review): `lr`, `current_lr` and `momentum_coeff` are never passed to
    # the optimizer - AdamOptimizer() runs with its defaults, so the step decay
    # below only affects what is logged. Left unchanged on purpose; confirm intent.
    lr = tf.Variable(initial_lr)
    opt = tf.train.AdamOptimizer()
    grads = opt.compute_gradients(L)
    apply_grad_op = opt.apply_gradients(grads)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)  # keep at most 10 checkpoints

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    sess = tf.Session(config=config)
    sess.run(init)
    coord = tf.train.Coordinator()
    # The input queues are only filled once the queue runners are started.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Make sure the checkpoint directory exists before the first save.
    snapshot_dir = 'snapshots'
    if not os.path.isdir(snapshot_dir):
        os.makedirs(snapshot_dir)

    current_lr = initial_lr
    # One row (step, lr, loss) per validation run; `//` keeps the integer
    # arithmetic independent of the interpreter's division semantics.
    validation_history = np.zeros(shape=(total_steps // validation_interval, 3))
    num_val_batches = validation_instances // batch_size_val

    if tabulate_available:
        def generate_validation_history(step, tbl):
            return tabulate(tbl, headers=['Step', 'LR', 'Loss'])
        print(tabulate(parameter_table))

    try:
        for step in range(total_steps + 1):
            if step % step_size == 0 and step > 0:
                current_lr *= step_factor  # step decay of the tracked learning rate
                print("Learning Rate: {}".format(current_lr))
            if step % checkpoint_interval == 0:
                # global_step appends the iteration number to the checkpoint name.
                saver.save(sess, 'snapshots/ranker_{}_{}.ckpt'.format(experiment_name, embedding_dim), global_step=step)
            t0 = time.time()
            _, loss_val = sess.run([apply_grad_op, L])
            t1 = time.time()
            print("Iteration {}: L={:0.4f} dT={:0.3f}".format(step, loss_val, t1-t0))
            if step % validation_interval == 0 and step > 0:
                # Average the validation loss over all full validation batches.
                val_avg = 0.0
                for k in range(num_val_batches):
                    val_loss = sess.run([L_val])[0]
                    val_avg += val_loss
                val_avg /= float(num_val_batches)
                validation_history[step // validation_interval - 1] = (step, current_lr, val_avg)
                if tabulate_available:
                    print(generate_validation_history(step, validation_history))
                else:
                    print("\tValidation: L={:0.4f}".format(val_avg))
                # Persist the history under the key "validation".
                np.savez("{}_history.npz".format(experiment_name), validation=validation_history)
        if tabulate_available:
            print(tabulate(parameter_table))
    finally:
        # Stop and join the queue-runner threads before closing the session so
        # the process shuts down cleanly even when training is interrupted.
        coord.request_stop()
        coord.join(threads)
        sess.close()
源码托管至:vfn_train.py。
相应的模型评价代码不再堆叠于此,已托管至:vfn_eval.py。
最后,将可视化代码也一并展示,并选取几张有代表性的处理后的图片供大家欣赏:
# -*- coding: utf-8 -*-
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw
def draw_rect(filename, rect_list):
    """Open an image, crop a rectangle from it and outline that rectangle in red.

    rect_list is [x, y, width, height] of the rectangle.

    Returns (image_with_outline, cropped_region).
    """
    raw_img = Image.open(filename)
    pen = ImageDraw.Draw(raw_img)
    left, top = rect_list[0], rect_list[1]
    right = rect_list[0] + rect_list[2]
    bottom = rect_list[1] + rect_list[3]
    croped_img = raw_img.crop((left, top, right, bottom))
    corners = [(left, top), (right, top), (right, bottom), (left, bottom)]
    # Draw the four edges corner to corner, closing back on the first corner.
    for start, end in zip(corners, corners[1:] + corners[:1]):
        pen.line([start, end], fill=(255, 0, 0), width=5)
    return raw_img, croped_img
def show_crop_full_image(filename, ground_truth_list, crop_list):
    """Show ground-truth and predicted crops side by side in a 2x2 grid.

    Top row: the full image with the ground-truth rectangle (left) and with
    the predicted rectangle (right). Bottom row: the corresponding crops.
    """
    ground_truth_img, croped_ground_truth = draw_rect(filename, ground_truth_list)
    crop_img, croped_crop_img = draw_rect(filename, crop_list)
    panels = (ground_truth_img, crop_img, croped_ground_truth, croped_crop_img)
    for position, panel in enumerate(panels, 1):
        plt.subplot(2, 2, position)
        plt.imshow(panel)
        plt.axis('off')
    plt.show()
if __name__ == "__main__":
    # Demo: compare a ground-truth box with a predicted crop on one image.
    filename, ground_truth_list, crop_list = (
        'FCDB/13091565704_a6bb18d1cb_c.jpg',
        [19, 45, 768, 436],
        [16, 44, 720, 480],
    )
    show_crop_full_image(filename, ground_truth_list, crop_list)
以下是几张美学构图的结果(图像左边为groundtruth,右边为论文方法的裁剪效果),可以看出,结果有好有坏,平均的 IoU 指标为0.684:
以下两张为挑选出来的效果较为不理想的:
照片的美学构图是一个非常有意思的研究方向,因为无论是在标注上还是其他方面都存在较大的主观性,深度学习的迅猛发展必将推动该方向的进一步完善,更多有趣的产品将会依附深度学习落地。