"Learning to Compose with Professional Photographs on the Web" 论文解读（二）（附代码与详细注释）

最新推荐文章于 2022-07-04 19:20:58 发布

打工人小飞

最新推荐文章于 2022-07-04 19:20:58 发布

阅读量955

点赞数 2

分类专栏：深度学习人工智能学术论文解读文章标签：美学构图 tfRecord 解码 photo composition 图像分割 photo croping

本文链接：https://blog.csdn.net/huangfei711/article/details/81176217

版权

人工智能同时被 3 个专栏收录

42 篇文章 11 订阅

订阅专栏

深度学习

33 篇文章 6 订阅

订阅专栏

学术论文解读

9 篇文章 2 订阅

订阅专栏

接着上篇博客：“Learning to Compose with Professional Photographs on the Web” 论文解读（一）（附代码与详细注释）进行讲解。上篇博客讲到了论文中数据的处理，该博客将介绍模型的构建、训练以及评估等部分内容，并附上详细的代码及注解。
这里写图片描述
上图是本文的模型架构，非常简单，输入为两个叠加的总共为6个通道的图像（原图与裁剪图），网络中的特征提取层即为 AlexNet 的前五层卷积网络，随后接一个可选的 SPP 层，两个全连接层，损失函数为 SVM hinge 损失，最后输出一个美学得分。详细代码如下：

#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np

def build_loss_matrix(batch_size):
    """Build the pairwise-difference matrix used by the ranking losses.

    Returns a float32 array of shape (batch_size, 2 * batch_size). Row k holds
    +1 in column k and -1 in column k + batch_size, so multiplying it with a
    column of 2 * batch_size scores yields score[k] - score[k + batch_size].
    """
    rows = np.arange(batch_size)
    matrix = np.zeros((batch_size, 2 * batch_size), dtype=np.float32)
    matrix[rows, rows] = 1
    matrix[rows, rows + batch_size] = -1
    return matrix

def score(feature_vec):
    """Project each feature row to a single score with a learned weight column.

    The weight variable "W" has shape [feature_dim, 1], where feature_dim is
    the second dimension of feature_vec, and is initialised with
    tf.uniform_unit_scaling_initializer().
    """
    feature_dim = feature_vec.get_shape()[1]
    projection = tf.get_variable(
        "W",
        shape=[feature_dim, 1],
        initializer=tf.uniform_unit_scaling_initializer())
    return tf.matmul(feature_vec, projection)

def svm_loss(feature_vec, loss_matrix):
    """Hinge (SVM-style) ranking loss.

    Scores every feature row, combines the scores with loss_matrix and
    averages max(0, 1 + combined) over all rows.

    Returns (mean hinge loss, combined scores before the hinge).
    """
    scores = score(feature_vec)
    margins = tf.matmul(loss_matrix, scores)
    floor = tf.constant(0.0, shape=[1], dtype=tf.float32)
    hinge = tf.maximum(floor, 1+margins)     # hinge term from the paper's formula
    return tf.reduce_mean(hinge), margins

def ranknet_loss(feature_vec, loss_matrix):
    """RankNet-style ranking loss (sigmoid cross-entropy against all-zero labels).

    Scores every feature row, combines the scores with loss_matrix and
    averages the sigmoid cross-entropy between the combined scores (used as
    logits) and a zero target.

    Returns (mean loss, combined scores used as logits).
    """
    q = score(feature_vec)
    p = tf.matmul(loss_matrix, q)
    # TensorFlow >= 1.0 rejects positional arguments for this op, so the
    # logits and labels are passed by name (same roles as the old positional
    # call: p is the logits, the zero tensor is the target).
    L = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(p), logits=p, name='RankNetLoss'))
    return L, p

def loss(feature_vec, loss_matrix, ranking_loss_type):
    """Dispatch to the ranking loss selected by ranking_loss_type.

    ranking_loss_type: 'svm' selects svm_loss, 'ranknet' selects ranknet_loss.

    Returns the (loss, combined scores) pair produced by the selected loss.

    Raises ValueError for any other ranking_loss_type. Returning None here
    would only surface later as an unrelated unpacking error in callers that
    do `L, p = loss(...)`.
    """
    if ranking_loss_type == 'svm':
        return svm_loss(feature_vec, loss_matrix)
    if ranking_loss_type == 'ranknet':
        return ranknet_loss(feature_vec, loss_matrix)
    raise ValueError("Error: ranking loss >> {} << is unknown".format(ranking_loss_type))

def conv(input, kernel, biases, k_h, k_w, c_o, s_h, s_w,  padding="VALID", group=1):
    '''Grouped 2-D convolution followed by a bias add.

    Adapted from https://github.com/ethereon/caffe-tensorflow.

    input:   input tensor; its last axis is read as the channel count and
             axis 3 is the axis split into groups (presumably
             [batch, height, width, channels] -- confirm against callers).
    kernel:  filter tensor handed to tf.nn.conv2d; split along axis 3 when
             group > 1.
    biases:  bias vector added to the convolution output.
    k_h, k_w: kernel height / width (not read by this function).
    c_o:     number of output channels; must be divisible by group.
    s_h, s_w: stride along height / width.
    padding: padding mode forwarded to tf.nn.conv2d.
    group:   number of groups the input and the kernel are split into; each
             input group is convolved with its own kernel group and the
             results are concatenated along axis 3.
    '''
    in_channels = input.get_shape()[-1]
    assert in_channels%group==0
    assert c_o%group==0
    strides = [1, s_h, s_w, 1]

    def apply_conv(tensor, weights):
        return tf.nn.conv2d(tensor, weights, strides, padding=padding)

    if group==1:
        result = apply_conv(input, kernel)
    else:
        input_parts = tf.split(input, group, 3)
        kernel_parts = tf.split(kernel, group, 3)
        partial_outputs = [apply_conv(t, w) for t, w in zip(input_parts, kernel_parts)]
        result = tf.concat(partial_outputs, 3)
    biased = tf.nn.bias_add(result, biases)
    # Reshape to the convolution's own static shape, with -1 for the leading
    # dimension so that it is inferred at run time.
    return tf.reshape(biased, [-1]+result.get_shape().as_list()[1:])

def get_variable_dict(net_data):
    """Wrap the conv1..conv5 parameters of net_data in tf.Variable objects.

    net_data is indexed as net_data["convN"][0] (weights) and
    net_data["convN"][1] (biases) for N = 1..5.

    Returns a dict with keys "c1w", "c1b", ..., "c5w", "c5b".
    """
    variables_dict = {}
    for layer_idx in range(1, 6):
        layer_params = net_data["conv{}".format(layer_idx)]
        variables_dict["c{}w".format(layer_idx)] = tf.Variable(layer_params[0])
        variables_dict["c{}b".format(layer_idx)] = tf.Variable(layer_params[1])
    return variables_dict

def build_alexconvnet(images, variable_dict, embedding_dim, SPP = False, pooling = 'max'):
    """
    Build the convolutional feature extractor followed by one fully connected layer.

    images:        input image tensor fed to conv1 (presumably
                   [batch, height, width, channels] -- confirm against caller).
    variable_dict: dict holding the conv weights/biases under the keys
                   "c1w"/"c1b" ... "c5w"/"c5b".
    embedding_dim: number of output units of the fc6 layer.
    SPP:           when true, conv5 is pooled at three different window sizes
                   and the flattened results are concatenated; otherwise a
                   single 3x3 / stride-2 pooling is applied.
    pooling:       'max' selects tf.nn.max_pool for the final pooling stage;
                   any other value selects tf.nn.avg_pool.

    Returns the fc6 activation, relu(bn5 * fc6w + fc6b).
    """
    #conv1
    #conv(11, 11, 96, 4, 4, padding='VALID', name='conv1')
    # NOTE(review): the reference spec above says VALID, but the call below passes padding="SAME".
    k_h = 11; k_w = 11; c_o = 96; s_h = 4; s_w = 4
    conv1W = variable_dict["c1w"]
    conv1b = variable_dict["c1b"]
    conv1_in = conv(images, conv1W, conv1b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=1)
    conv1 = tf.nn.relu(conv1_in)
    #lrn1
    #lrn(2, 2e-05, 0.75, name='norm1')
    radius = 2; alpha = 2e-05; beta = 0.75; bias = 1.0
    lrn1 = tf.nn.local_response_normalization(conv1,
                                                      depth_radius=radius,
                                                      alpha=alpha,
                                                      beta=beta,
                                                      bias=bias)
    #maxpool1
    #max_pool(3, 3, 2, 2, padding='VALID', name='pool1')
    k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
    maxpool1 = tf.nn.max_pool(lrn1, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)
    #conv2
    #conv(5, 5, 256, 1, 1, group=2, name='conv2')
    k_h = 5; k_w = 5; c_o = 256; s_h = 1; s_w = 1; group = 2
    conv2W = variable_dict["c2w"]
    conv2b = variable_dict["c2b"]
    conv2_in = conv(maxpool1, conv2W, conv2b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv2 = tf.nn.relu(conv2_in)
    #lrn2
    #lrn(2, 2e-05, 0.75, name='norm2')
    radius = 2; alpha = 2e-05; beta = 0.75; bias = 1.0
    lrn2 = tf.nn.local_response_normalization(conv2,
                                                      depth_radius=radius,
                                                      alpha=alpha,
                                                      beta=beta,
                                                      bias=bias)
    #maxpool2
    #max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
    k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
    maxpool2 = tf.nn.max_pool(lrn2, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)
    #conv3
    #conv(3, 3, 384, 1, 1, name='conv3')
    k_h = 3; k_w = 3; c_o = 384; s_h = 1; s_w = 1; group = 1
    conv3W = variable_dict["c3w"]
    conv3b = variable_dict["c3b"]
    conv3_in = conv(maxpool2, conv3W, conv3b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv3 = tf.nn.relu(conv3_in)
    #conv4
    #conv(3, 3, 384, 1, 1, group=2, name='conv4')
    k_h = 3; k_w = 3; c_o = 384; s_h = 1; s_w = 1; group = 2
    conv4W = variable_dict["c4w"]
    conv4b = variable_dict["c4b"]
    conv4_in = conv(conv3, conv4W, conv4b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv4 = tf.nn.relu(conv4_in)
    #conv5
    #conv(3, 3, 256, 1, 1, group=2, name='conv5')
    k_h = 3; k_w = 3; c_o = 256; s_h = 1; s_w = 1; group = 2
    conv5W = variable_dict["c5w"]
    conv5b = variable_dict["c5b"]
    conv5_in = conv(conv4, conv5W, conv5b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
    conv5 = tf.nn.relu(conv5_in)
    #maxpool5
    #max_pool(3, 3, 2, 2, padding='VALID', name='pool5')
    with tf.variable_scope("conv5"):
        k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
        # Select the pooling op once; it is used by both branches below.
        if pooling == 'max':
            pooling_func = tf.nn.max_pool
        else:
            pooling_func = tf.nn.avg_pool
        if SPP:
            # Pool conv5 at three window sizes (5x5/stride 4, 7x7/stride 6,
            # 3x3/stride 2), flatten each result and concatenate along axis 1.
            # The names maxpool1/maxpool2 are rebound here; the earlier
            # pooling outputs are no longer needed at this point.
            maxpool3 = pooling_func(conv5, ksize=[1, 5, 5, 1], strides=[1, 4, 4, 1], padding=padding)
            maxpool2 = pooling_func(conv5, ksize=[1, 7, 7, 1], strides=[1, 6, 6, 1], padding=padding)
            maxpool1 = pooling_func(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding=padding)
            concat5 = tf.concat([tf.contrib.layers.flatten(maxpool1), tf.contrib.layers.flatten(maxpool2), tf.contrib.layers.flatten(maxpool3)], 1)
            bn5 = concat5
        else:
            maxpool5 = pooling_func(conv5, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)
            bn5 = tf.contrib.layers.flatten(maxpool5)
    # The fc6 variables are created after the "conv5" scope block has ended.
    flattened_dim = int(np.prod(bn5.get_shape()[1:]))
    fc6W = tf.get_variable("fc6w", [flattened_dim, embedding_dim], initializer = tf.uniform_unit_scaling_initializer()) # weight matrix of shape (flattened_dim, embedding_dim)
    fc6b = tf.get_variable("fc6b", [embedding_dim], initializer = tf.constant_initializer())  # bias vector of length embedding_dim
    fc6 = tf.nn.relu_layer(bn5, fc6W, fc6b)    # fc6 = relu(bn5 * fc6w + fc6b)

    return fc6

源码托管至：network.py。

数据增强（data augmentation）、tfRecord 解码、训练流程的搭建以及具体训练细节等在此不作详述，贴出代码一目了然：

# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np
import time
import imp
import network as nw
tabulate_available = False
try:
    imp.find_module('tabulate')
    tabulate_available = True
except ImportError:
    pass
if tabulate_available:
    from tabulate import tabulate       # 打印表单模块
import argparse

def read_and_decode(filename_queue):
    """Read one TFRecord example and return its two image halves.

    The 'image_raw' bytes are decoded as uint8, reshaped to [227, 227, 6],
    rescaled from [0, 255] to [-0.5, 0.5] floats and split into two equal
    parts along axis 2.
    """
    _, serialized_example = tf.TFRecordReader().read(filename_queue)
    # No default is given, so the 'image_raw' key is required.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(serialized_example, features=feature_spec)

    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels = tf.reshape(pixels, [227, 227, 6])

    # Convert from [0, 255] -> [-0.5, 0.5] floats.
    scaled = tf.cast(pixels, tf.float32) * (1. / 255) - 0.5
    return tf.split(scaled, 2, 2)

def read_and_decode_aug(filename_queue):
    """Read one TFRecord example with random augmentation applied.

    Same decoding as read_and_decode, plus a random left/right flip applied
    to the 6-channel tensor as a whole, followed (after rescaling to
    [-0.5, 0.5]) by small random brightness and contrast perturbations.
    Returns the two halves obtained by splitting along axis 2.
    """
    _, serialized_example = tf.TFRecordReader().read(filename_queue)
    # No default is given, so the 'image_raw' key is required.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(serialized_example, features=feature_spec)

    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    stacked = tf.reshape(pixels, [227, 227, 6])
    flipped = tf.image.random_flip_left_right(stacked)
    # Convert from [0, 255] -> [-0.5, 0.5] floats.
    scaled = tf.cast(flipped, tf.float32) * (1. / 255) - 0.5
    jittered = tf.image.random_brightness(scaled, 0.01)
    jittered = tf.image.random_contrast(jittered, 0.95, 1.05)
    # Split axis 2 back into the two stacked images.
    return tf.split(jittered, 2, 2)

def inputs(filename, batch_size, num_epochs = None, shuffle = False, aug=False):
    """
    Build the batched input tensor from a TFRecord file.

    filename:   path of the TFRecord file.
    batch_size: number of examples per batch.
    num_epochs: forwarded to tf.train.string_input_producer.
    shuffle:    use tf.train.shuffle_batch when true, tf.train.batch otherwise.
    aug:        decode with read_and_decode_aug (random augmentation) when
                true, read_and_decode otherwise.

    Returns the two batched halves concatenated along axis 0 (crops first,
    then fulls).
    """
    with tf.name_scope('input'):
        # Queue that yields the file name to the reader.
        filename_queue = tf.train.string_input_producer(
            [filename], num_epochs=num_epochs)

    # Even when reading in multiple threads, share the filename queue.
    decode = read_and_decode_aug if aug else read_and_decode
    crop, full = decode(filename_queue)

    if shuffle:
        crops, fulls = tf.train.shuffle_batch(
            [crop, full],
            batch_size=batch_size,
            num_threads=4,
            capacity=2000 + 4 * batch_size,
            enqueue_many=False,
            min_after_dequeue=1000)
    else:
        crops, fulls = tf.train.batch(
            [crop, full],
            batch_size=batch_size,
            num_threads=1,
            capacity=100 + 3 * batch_size,
            allow_smaller_final_batch=False)

    return tf.concat([crops, fulls], 0)

# Helper Functions

def count_tfrecords(path):
    """Return the number of records stored in the TFRecord file at path."""
    return sum(1 for _ in tf.python_io.tf_record_iterator(path))

def str2bool(v):
    """Parse a command-line string into a bool (case-insensitive).

    'yes', 'true', 't', 'y', '1' map to True; 'no', 'false', 'f', 'n', '0'
    map to False. Anything else raises argparse.ArgumentTypeError.
    """
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')
    lowered = v.lower()
    if lowered in truthy:
        return True
    if lowered in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

if __name__ == '__main__':
    # Training entry point: parse the options, build the training and
    # validation graphs (sharing variables), then run the optimisation loop
    # with periodic checkpoints and validation passes.
    import os  # local import: only used to create the snapshot directory

    parser = argparse.ArgumentParser()
    parser.add_argument("--embedding_dim", help="Embedding dimension before mapping to one-dimensional score", type=int, default = 1000)
    parser.add_argument("--validation_interval", help="Number of iterations after which validation is run", type=int, default = 500)
    parser.add_argument("--batch_train", help="Batch size for training", type=int, default=100)
    parser.add_argument("--batch_val", help="Batch size for validation", type=int, default=14)
    parser.add_argument("--checkpoint_interval", help="Number of iterations after which a checkpoint file is written", type=int, default=1000)
    parser.add_argument("--total_steps", help="Number of total training iterations", type=int, default=15000)
    parser.add_argument("--initial_lr", help="Initial learning rate", type=float, default=0.01)
    parser.add_argument("--momentum", help="Momentum coefficient", type=float, default=0.9)
    parser.add_argument("--step_size", help="Number of steps after which the learning rate is reduced", type=int, default=10000)
    parser.add_argument("--step_factor", help="Reduction factor for the learning rate", type=float, default=0.2)
    parser.add_argument("--initial_parameters", help="Path to initial parameter file", type=str, default="alexnet.npy")
    parser.add_argument("--ranking_loss", help="Type of ranking loss", type=str, choices=['ranknet', 'svm'], default='svm')
    parser.add_argument("--checkpoint_name", help="Name of the checkpoint files", type=str, default='view_finding_network')
    parser.add_argument("--spp", help="Whether to use spatial pyramid pooling in the last layer or not", type=str2bool, default=True)
    parser.add_argument("--pooling", help="Which pooling function to use", type=str, choices=['max', 'avg'], default='max')
    parser.add_argument("--augment", help="Whether to augment training data or not", type=str2bool, default=True)
    parser.add_argument("--training_db", help="Path to training database", type=str, default='trn.tfrecords')
    parser.add_argument("--validation_db", help="Path to validation database", type=str, default='val.tfrecords')

    args = parser.parse_args()
    embedding_dim = args.embedding_dim
    validation_interval = args.validation_interval
    batch_size_trn = args.batch_train
    batch_size_val = args.batch_val
    checkpoint_interval = args.checkpoint_interval
    total_steps = args.total_steps
    validation_instances = count_tfrecords(args.validation_db)    # number of records in the validation set
    initial_lr = args.initial_lr
    momentum_coeff = args.momentum
    step_size = args.step_size
    step_factor = args.step_factor
    parameter_path = args.initial_parameters       # pre-trained AlexNet parameters
    ranking_loss = args.ranking_loss
    experiment_name = args.ranking_loss
    spp = args.spp
    augment_training_data = args.augment

    parameter_table = [["Initial parameters", parameter_path],
                    ["Ranking loss", ranking_loss], ["SPP", spp], ["Pooling", args.pooling],
                    ['Experiment', experiment_name],
                    ['Embedding dim', embedding_dim], ['Batch size', batch_size_trn],
                    ['Initial LR', initial_lr], ['Momentum', momentum_coeff],
                    ['LR Step size', step_size], ['LR Step factor', step_factor],
                    ['Total Steps', total_steps]]

    training_images = inputs(args.training_db, batch_size_trn, None, True, augment_training_data)    # shuffled (optionally augmented) training batches
    test_images = inputs(args.validation_db, batch_size_val, None, False)                            # validation batches
    # The parameter file stores a pickled dict, which recent NumPy versions
    # refuse to load unless allow_pickle=True is given.
    # NOTE(review): unpickling executes arbitrary code -- only load parameter
    # files from a trusted source.
    net_data = np.load(parameter_path, allow_pickle=True).item()
    var_dict = nw.get_variable_dict(net_data)           # wrap the loaded conv parameters in tf.Variables
    with tf.variable_scope("ranker") as scope:
        feature_vec = nw.build_alexconvnet(training_images, var_dict, embedding_dim, spp, args.pooling)
        L, p = nw.loss(feature_vec, nw.build_loss_matrix(batch_size_trn), ranking_loss)
        scope.reuse_variables()  # the validation graph shares the variables created above
        val_feature_vec = nw.build_alexconvnet(test_images, var_dict, embedding_dim, spp, args.pooling)
        L_val, p_val = nw.loss(val_feature_vec, nw.build_loss_matrix(batch_size_val), ranking_loss)

    # NOTE(review): `lr`, `current_lr` and `momentum_coeff` are never handed
    # to the optimizer; AdamOptimizer() runs with its default settings, so the
    # step decay below only changes what is logged. Left as-is so that the
    # training behaviour and the set of saved variables stay unchanged.
    lr = tf.Variable(initial_lr)
    opt = tf.train.AdamOptimizer()
    grads = opt.compute_gradients(L)
    apply_grad_op = opt.apply_gradients(grads)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)   # keep at most 10 checkpoints
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True        # grow GPU memory usage on demand
    current_lr = initial_lr

    # saver.save() below writes into 'snapshots/'; make sure it exists.
    snapshot_dir = 'snapshots'
    if not os.path.isdir(snapshot_dir):
        os.makedirs(snapshot_dir)

    # Floor division keeps these counts integral on both Python 2 and 3.
    # At least one validation batch is run so the average below never
    # divides by zero when the validation set is smaller than one batch.
    num_val_batches = max(1, validation_instances // batch_size_val)
    validation_history = np.zeros(shape=(total_steps // validation_interval, 3))  # rows of (step, lr, validation loss)
    if tabulate_available:
        def generate_validation_history(step, tbl):
            return tabulate(tbl, headers=['Step', 'LR', 'Loss'])
        print(tabulate(parameter_table))

    sess = tf.Session(config=config)
    coord = tf.train.Coordinator()                # coordinates the input queue threads
    threads = []
    try:
        sess.run(init)
        # The input queues are only filled once the queue runners are started.
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        for step in range(total_steps+1):
            if step % step_size == 0 and step > 0:
                current_lr *= step_factor           # step decay of the logged learning rate
                print("Learning Rate: {}".format(current_lr))
            if step % checkpoint_interval == 0:
                # global_step appends the step number to the checkpoint file name.
                saver.save(sess, 'snapshots/ranker_{}_{}.ckpt'.format(experiment_name, embedding_dim), global_step=step)
            t0 = time.time()
            _, loss_val = sess.run([apply_grad_op, L])
            t1 = time.time()
            print("Iteration {}: L={:0.4f} dT={:0.3f}".format(step, loss_val, t1-t0))    # step, training loss, step duration
            if step % validation_interval == 0 and step > 0:
                # Average the validation loss over num_val_batches batches.
                val_avg = 0.0
                for k in range(num_val_batches):
                    val_loss = sess.run([L_val])[0]
                    val_avg+=val_loss
                val_avg /= float(num_val_batches)
                validation_history[step // validation_interval - 1] = (step, current_lr, val_avg)
                if tabulate_available:
                    print(generate_validation_history(step // validation_interval, validation_history))
                else:
                    print("\tValidation: L={:0.4f}".format(val_avg))
                # Stored under the key 'validation' in the .npz file.
                np.savez("{}_history.npz".format(experiment_name), validation=validation_history)
        if tabulate_available:
            print(tabulate(parameter_table))
    finally:
        # Stop and join the queue-runner threads before closing the session,
        # also when training is interrupted by an exception.
        coord.request_stop()
        try:
            coord.join(threads)
        finally:
            sess.close()

源码托管至：vfn_train.py。
相应的模型评价代码不再堆叠于此，已托管至：vfn_eval.py。

最后，将可视化代码也一并展示，并选取几张有代表性的处理后的图片供大家欣赏：

# -*- coding: utf-8 -*-

from matplotlib import pyplot as plt
from PIL import Image, ImageDraw

def draw_rect(filename, rect_list):
    """Open an image, crop a rectangle out of it and outline that rectangle.

    rect_list is read as [x, y, width, height]. The crop is taken before the
    red, 5-pixel-wide outline is drawn onto the opened image.

    Returns (image with the outline drawn on it, cropped region).
    """
    raw_img = Image.open(filename)
    draw = ImageDraw.Draw(raw_img)
    left, top = rect_list[0], rect_list[1]
    right = left + rect_list[2]
    bottom = top + rect_list[3]
    croped_img = raw_img.crop((left, top, right, bottom))
    # Walk the four corners clockwise, drawing one edge per consecutive pair.
    corners = [(left, top), (right, top), (right, bottom), (left, bottom)]
    for start, end in zip(corners, corners[1:] + corners[:1]):
        draw.line([start, end], fill=(255, 0, 0), width=5)
    return raw_img, croped_img

def show_crop_full_image(filename, ground_truth_list, crop_list):
    """Show a 2x2 comparison of two rectangles on the same image.

    Top row: the image with the ground-truth rectangle outlined (left) and
    with the crop_list rectangle outlined (right). Bottom row: the two
    corresponding cropped regions. Axes are hidden on every panel.
    """
    ground_truth_img, croped_ground_truth = draw_rect(filename, ground_truth_list)
    crop_img, croped_crop_img = draw_rect(filename, crop_list)
    panels = (ground_truth_img, crop_img, croped_ground_truth, croped_crop_img)
    for position, panel in enumerate(panels, 1):
        plt.subplot(2, 2, position)
        plt.imshow(panel)
        plt.axis('off')
    plt.show()

if __name__ == "__main__":
    # Demo: compare a ground-truth rectangle with another crop rectangle on one image.
    filename = 'FCDB/13091565704_a6bb18d1cb_c.jpg'
    # Both rectangles are given as [x, y, width, height], as read by draw_rect.
    ground_truth_list = [19, 45, 768, 436]
    crop_list = [16, 44, 720, 480]
    show_crop_full_image(filename, ground_truth_list, crop_list)

以下是几张美学构图的结果（图像左边为groundtruth，右边为论文方法的裁剪效果），可以看出，结果有好有坏，平均的 IoU 指标为0.684：
这里写图片描述

以下两张为挑选出来的效果较为不理想的：

照片的美学构图是一个非常有意思的研究方向，因为无论是在标注上还是其他方面都存在较大的主观性，深度学习的迅猛发展必将推动该方向的进一步完善，更多有趣的产品将会依附深度学习落地。

打工人小飞

关注

2
点赞
踩
1

收藏

觉得还不错? 一键收藏
2
评论
"Learning to Compose with Professional Photographs on the Web" 论文解读（二）（附代码与详细注释）

接着上篇博客：“Learning to Compose with Professional Photographs on the Web” 论文解读（一）（附代码与详细注释）进行讲解。上篇博客讲到了论文中数据的处理，该博客将介绍模型的构建、训练以及评估等部分内容，并附上详细的代码及注解。上图是本文的模型架构，非常简单，输入为两个叠加的总共为6个通道的图像（原图与裁剪图），网络中的特征提取层...
复制链接

扫一扫