文章目录
前言
学习笔记:深度学习(3)——卷积神经网络(CNN)理论篇_cnn理论-CSDN博客
学习图像风格迁移前,需要掌握一些深度学习的卷积神经网络知识,可以参考上面这篇文章。
简单来说,卷积神经网络(Convolutional Neural Networks, CNNs)是一种深度学习架构,特别适合于处理具有网格结构的数据,如图像(2D网格)和声音(1D网格)。CNN通过使用卷积层自动且有效地捕捉空间和时间上的局部模式来学习数据的特征。
而VGG16是一种深度卷积神经网络(Convolutional Neural Network, CNN)模型。它由牛津大学的视觉几何组(Visual Geometry Group)开发,因此得名VGG。VGG16特别指的是这个系列模型中包含16层权重的版本,它在图像识别和图像分类任务中表现出色。
本次图像风格迁移的学习使用VGG16预训练模型中的卷积层、池化层来进行图像特征的提取,以实现生成图与内容图、风格图的特征差异量化,从而进行特征调整实现风格迁移。
1. 算法的基本原理
算法大致分为四个部分:
- 随机初始化白噪声图像G,G使用三维张量(高,宽,颜色通道数)表示。
- 将图像G输入VGG16/19模型中进行不同卷积层的特征提取,根据对内容图和风格图提取的特征计算出损失函数。损失函数是指生成图片相对于内容图片C的内容差异和风格图片S的风格差异的定量表示。
- 计算出损失函数对于每个像素的梯度。梯度指向当前像素点处损失函数值上升最快的方向,其反方向(负梯度)即为下降最快的方向;后续沿负梯度方向调整像素值,即可减小整个图像的损失,使生成图达到要求。
- 将每个像素沿着负梯度方向微调,以减少总损失。未训练完成返回步骤2,否则输出最终图像。
2. 代码实现
VGG16预训练模型类的封装
构建VGG网络类,初始化时将VGG16预训练模型参数读入,使用build函数进行模型不同层次的构建生成。
import os
import math
import numpy as np
import tensorflow as tf
tf.compat.v1.disable_v2_behavior()
from PIL import Image
import time
VGG_MEAN = [103.939, 116.779, 123.68] # Per-channel mean pixel values in B, G, R order; VGGNet.build subtracts them from the input channels
class VGGNet:
    """VGG-16 convolutional feature extractor built from pre-trained parameters.

    ``data_dict`` maps a layer name (for example ``'conv1_1'``) to a sequence
    whose element 0 holds the layer's weights and element 1 its biases. Every
    parameter is embedded in the graph with ``tf.constant``, so the network
    itself contributes no trainable variables.
    """

    # Layers created by build(), in forward order. Names starting with
    # 'conv' are convolution layers looked up in data_dict; the others are
    # parameter-free max-pooling layers.
    _FEATURE_LAYERS = (
        'conv1_1', 'conv1_2', 'pool1',
        'conv2_1', 'conv2_2', 'pool2',
        'conv3_1', 'conv3_2', 'conv3_3', 'pool3',
        'conv4_1', 'conv4_2', 'conv4_3', 'pool4',
        'conv5_1', 'conv5_2', 'conv5_3', 'pool5',
    )

    def __init__(self, data_dict):
        """Store the pre-trained parameter dictionary for later lookups."""
        self.data_dict = data_dict

    def get_conv_filter(self, name):
        """Return the weights of convolution layer ``name`` as a constant."""
        return tf.constant(self.data_dict[name][0], name='conv')

    def get_fc_weight(self, name):
        """Return the weights of fully connected layer ``name`` as a constant."""
        return tf.constant(self.data_dict[name][0], name='fc')

    def get_bias(self, name):
        """Return the biases of layer ``name`` as a constant."""
        return tf.constant(self.data_dict[name][1], name='bias')

    def conv_layer(self, x, name):
        """Apply stride-1 SAME convolution, bias add and ReLU for layer ``name``."""
        with tf.name_scope(name):
            conv_w = self.get_conv_filter(name)
            conv_b = self.get_bias(name)
            h = tf.nn.conv2d(x, conv_w, [1, 1, 1, 1], padding='SAME')
            h = tf.nn.bias_add(h, conv_b)
            return tf.nn.relu(h)

    def pooling_layer(self, x, name):
        """Apply 2x2 max pooling with stride 2 and SAME padding."""
        return tf.nn.max_pool(
            x,
            ksize=[1, 2, 2, 1],
            strides=[1, 2, 2, 1],
            padding='SAME',
            name=name,
        )

    def fc_layer(self, x, name, activation=tf.nn.relu):
        """Apply fully connected layer ``name``.

        Args:
            x: 2-D input tensor (it is multiplied with the layer weights).
            name: Key of the layer in ``data_dict``.
            activation: Callable applied to the affine output, or ``None``
                to return the raw affine output.
        """
        with tf.name_scope(name):
            fc_w = self.get_fc_weight(name)
            fc_b = self.get_bias(name)
            h = tf.nn.bias_add(tf.matmul(x, fc_w), fc_b)
            return h if activation is None else activation(h)

    def flatten_layer(self, x, name):
        """Collapse every axis after the first into a single axis.

        The static sizes of all non-leading axes must be known, because
        their product is computed in Python.
        """
        with tf.name_scope(name):
            dim = 1
            for d in x.get_shape().as_list()[1:]:
                dim *= d
            return tf.reshape(x, [-1, dim])

    def build(self, x_rgb):
        """Build the convolution/pooling stack on top of ``x_rgb``.

        Each layer output is stored on the instance under its layer name
        (``self.conv1_1`` ... ``self.pool5``).

        Args:
            x_rgb: 4-D image tensor or variable whose last axis holds the
                R, G, B channels, e.g. shape ``[1, 224, 224, 3]`` or
                ``[1, 600, 800, 3]``. The spatial size is not restricted.

        Raises:
            ValueError: If the last axis of ``x_rgb`` does not have size 3.
        """
        start_time = time.time()
        print('building model ...')

        # Only the channel count is fixed by the network; the spatial size
        # depends on the images being processed.
        channels = x_rgb.get_shape().as_list()[-1]
        if channels != 3:
            raise ValueError(
                'x_rgb must have 3 channels on its last axis, got %r' % (channels,)
            )

        # Reorder to B, G, R and subtract the per-channel means so the input
        # matches the ordering of VGG_MEAN.
        r, g, b = tf.split(x_rgb, [1, 1, 1], axis=3)
        x_bgr = tf.concat(
            [b - VGG_MEAN[0], g - VGG_MEAN[1], r - VGG_MEAN[2]],
            axis=3,
        )

        h = x_bgr
        for layer in self._FEATURE_LAYERS:
            make_layer = (
                self.conv_layer if layer.startswith('conv') else self.pooling_layer
            )
            h = make_layer(h, layer)
            setattr(self, layer, h)

        # The classifier head (flatten, fc6-fc8, softmax) is intentionally
        # not built here; flatten_layer and fc_layer remain available for
        # callers that need it.
        print('building model finished: %4ds' % (time.time() - start_time))
损失函数的计算
分别将结果图的内容特征和风格特征与内容图和风格图相应特征进行计算,得到内容损失,风格损失,总损失。
# Paths to the pre-trained VGG16 parameters and the two input images.
vgg16_npy_path = 'vgg16.npy'
content_img_path = 'content/GuGong.jpg'
style_img_path = 'style/StarryNight2.jpg'

# Number of optimisation steps and the optimiser's learning rate.
num_steps = 100
learning_rate = 10

# Weights of the content and style terms in the total loss; tune per image pair.
lambda_c = 0.10
lambda_s = 5

# Directory that receives the generated images. makedirs with exist_ok
# avoids the check-then-create race and also handles nested paths.
output_dir = 'transfer'
os.makedirs(output_dir, exist_ok=True)
def initial_result(shape, mean, stddev):
    """Create a variable initialised with truncated-normal noise.

    Args:
        shape: Shape of the variable to create.
        mean: Mean of the truncated normal distribution.
        stddev: Standard deviation of the truncated normal distribution.

    Returns:
        A ``tf.Variable`` whose initial value is the sampled noise.
    """
    noise = tf.compat.v1.truncated_normal(shape, mean=mean, stddev=stddev)
    return tf.Variable(noise)
def read_img(img_name):
    """Load an image file as a batched integer pixel array.

    The image is converted to RGB so that grayscale, palette or RGBA files
    also produce three channels. It is not resized.

    Args:
        img_name: Path of the image file to open.

    Returns:
        An ``int32`` NumPy array of shape ``(1, height, width, 3)``.
    """
    # The context manager releases the file handle once the pixels are read.
    with Image.open(img_name) as img:
        pixels = np.array(img.convert('RGB'))
    # Wrap in a list to add the leading batch axis.
    return np.asarray([pixels], dtype=np.int32)
# Gram matrix of a feature map.
def gram_matrix(x):
    """Return the normalised Gram matrix of a 4-D feature map.

    Args:
        x: Feature tensor with a fully defined static 4-D shape; the last
            axis is the channel axis.

    Returns:
        A tensor of shape ``[batch, channels, channels]`` holding the
        channel-by-channel inner products over all spatial positions,
        divided by ``channels * dim1 * dim2``.
    """
    batch, dim1, dim2, channels = x.get_shape().as_list()
    # Flatten the two spatial axes so each row is one spatial position.
    flat = tf.reshape(x, [batch, dim2 * dim1, channels])
    scale = tf.constant(channels * dim1 * dim2, tf.float32)
    return tf.matmul(flat, flat, adjoint_a=True) / scale
# Static shape (batch, height, width, channels) shared by the generated
# image and both input placeholders; change it here to match the inputs.
image_shape = (1, 600, 800, 3)  # e.g. (1, 224, 224, 3) for 224x224 images

# The generated image is the variable being optimised; it starts as noise
# around mid-grey.
result = initial_result(image_shape, 127.5, 20)

content_val = read_img(content_img_path)
style_val = read_img(style_img_path)

# Fail early with a clear message instead of at sess.run time when an input
# image does not match the placeholder shape.
for _img_path, _img_val in (
    (content_img_path, content_val),
    (style_img_path, style_val),
):
    if _img_val.shape != image_shape:
        raise ValueError(
            'image %r has shape %r, expected %r'
            % (_img_path, _img_val.shape, image_shape)
        )

tf.compat.v1.disable_eager_execution()
content = tf.compat.v1.placeholder(tf.float32, shape=list(image_shape))
style = tf.compat.v1.placeholder(tf.float32, shape=list(image_shape))

# NOTE: allow_pickle=True unpickles arbitrary objects from the file; only
# load a parameter file from a trusted source.
data_dict = np.load(vgg16_npy_path, allow_pickle=True, encoding="latin1").item()

# One network per input; all three share the same parameter dictionary.
vgg_for_content = VGGNet(data_dict)
vgg_for_style = VGGNet(data_dict)
vgg_for_result = VGGNet(data_dict)

vgg_for_content.build(content)
vgg_for_style.build(style)
vgg_for_result.build(result)
# Layers compared for the content loss (the author recommends earlier
# convolution layers). Any layer attribute set by VGGNet.build may be listed,
# e.g. 'conv3_3', 'conv4_3', 'conv5_3'.
content_layers = ('conv1_2', 'conv2_2')

# Layers compared for the style loss; 'conv1_2' is another candidate.
style_layers = ('conv2_2', 'conv3_3', 'conv4_3', 'conv5_3')

# Both sides of each comparison are derived from the same tuple, so the
# source and result feature lists cannot drift out of sync.
content_features = [getattr(vgg_for_content, layer) for layer in content_layers]
result_content_features = [getattr(vgg_for_result, layer) for layer in content_layers]

style_features = [getattr(vgg_for_style, layer) for layer in style_layers]
style_gram = [gram_matrix(feature) for feature in style_features]

result_style_features = [getattr(vgg_for_result, layer) for layer in style_layers]
result_style_gram = [gram_matrix(feature) for feature in result_style_features]
# Content loss: per-sample mean squared difference between the content
# image's features and the generated image's features, summed over layers.
content_loss = sum(
    (
        tf.reduce_mean((src - gen) ** 2, [1, 2, 3])
        for src, gen in zip(content_features, result_content_features)
    ),
    tf.zeros(1, tf.float32),
)

# Style loss: per-sample mean squared difference between Gram matrices,
# summed over layers.
style_loss = sum(
    (
        tf.reduce_mean((src_gram - gen_gram) ** 2, [1, 2])
        for src_gram, gen_gram in zip(style_gram, result_style_gram)
    ),
    tf.zeros(1, tf.float32),
)

# Weighted total loss, the Adam update step, and the variable initialiser.
loss = content_loss * lambda_c + style_loss * lambda_s
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(loss)
init_op = tf.compat.v1.global_variables_initializer()
图像风格转换训练
# Optimisation loop: run num_steps updates and save the generated image
# after every step.
with tf.compat.v1.Session() as sess:
    sess.run(init_op)
    for step in range(num_steps):
        iteration = step + 1

        # One optimiser step; also fetch the three loss values for logging.
        loss_value, content_loss_value, style_loss_value, _ = sess.run(
            [loss, content_loss, style_loss, train_op],
            feed_dict={content: content_val, style: style_val},
        )
        print('step: %d, loss_value: %8.4f, content_loss: %8.4f, style_loss: %8.4f'
              % (iteration,
                 loss_value[0],
                 content_loss_value[0],
                 style_loss_value[0]))

        # Drop the batch axis, clamp to the valid pixel range and convert
        # to 8-bit before writing the snapshot.
        result_val = sess.run(result)[0]
        img_arr = np.clip(result_val, 0, 255).astype(np.uint8)
        result_img_path = os.path.join(output_dir, 'result-%05d.jpg' % iteration)
        Image.fromarray(img_arr).save(result_img_path)
运行结果
3. 风格转换结果展示
训练1
训练2
小结
- 通过本次学习,我了解了图片风格迁移的基本原理,能够简单编写代码使用VGG16模型进行图片风格迁移训练,从而生成不同风格的图片,并体会到AI中的艺术。
- 本人在学习图片风格迁移前没有机器学习和深度学习基础,所以文中有很多地方的描述可能不太准确,希望大家能批评指正!
参考资源
[1] Gatys, Leon & Ecker, Alexander & Bethge, Matthias. (2015). A Neural Algorithm of Artistic Style. arXiv. 10.1167/16.12.326.
[2] 【编程奇妙夜】神经网络风格迁移-手机街拍秒变世界名画_哔哩哔哩_bilibili
[3] 太神奇了!基于卷积神经网络的图像风格迁移,算法讲解+代码实现+效果展示!图像风格转换居然如此丝滑!人工智能_AI_计算机视觉_哔哩哔哩_bilibili
资源分享(源代码,VGG16模型,训练图片)
链接:https://pan.baidu.com/s/17g1-qGwgDg2nhJZ9DvwQxA?pwd=uki6
提取码:uki6