Show and Tell: A Neural Image Caption Generator (CVPR 2015)
The paper's contributions:
1. An end-to-end neural network that can be trained directly with stochastic gradient descent.
2. It combines two strong sub-networks (a CNN and an RNN), each of which can be pretrained on external datasets.
3. It achieves excellent results on public datasets.
Model:
Objective: maximize the likelihood of the target sentence S given the image I:
\theta^* = \arg\max_{\theta} \sum_{(I,S)} \log p(S \mid I; \theta)
Further, applying the chain rule to decompose the log-likelihood, with the sentence S consisting of N words and each word assigned a time step t, we have:
\log p(S \mid I) = \sum_{t=0}^{N} \log p(S_t \mid I, S_0, \dots, S_{t-1})
That is, for an input image I, the probability of the output sentence S is the product of the probabilities of generating each word S_t at its time step; taking logs turns this product into a sum of per-word log probabilities.
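A minimal numeric sketch of this factorization (the probabilities are made up for illustration):

import math

step_probs = [0.5, 0.4, 0.9]        # hypothetical p(S_t | I, S_0..S_{t-1}) for three steps
p_sentence = math.prod(step_probs)  # product form: p(S|I) = 0.18
log_p = sum(math.log(p) for p in step_probs)  # sum-of-logs form
assert abs(math.log(p_sentence) - log_p) < 1e-12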
The authors observe that this formula maps naturally onto an RNN: to compute the probability of the current word S_t at time t, the history S_0 to S_{t-1} is summarized by a fixed-length hidden-state vector h_t; given the input x_t, the hidden state is updated at time t+1:
h_{t+1} = f(h_t, x_t)
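A minimal sketch of this recurrence using PyTorch's LSTMCell (sizes are illustrative, not from the paper):

import torch
import torch.nn as nn

cell = nn.LSTMCell(input_size=256, hidden_size=512)  # f in the formula is an LSTM here
h = torch.zeros(1, 512)   # h_0 summarizes the (empty) history
c = torch.zeros(1, 512)   # LSTM cell state, carried alongside h
for t in range(3):
    x_t = torch.randn(1, 256)   # stand-in for the embedding of word S_t
    h, c = cell(x_t, (h, c))    # h_{t+1} = f(h_t, x_t)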
Training:
A few points worth noting (a small token-wrapping sketch follows the list):
The LSTM unit shares one set of parameters across all time steps.
Words are represented as one-hot vectors.
Every sentence is bracketed by special tokens marking its start and end.
The image is fed in only once; the authors tried feeding it to the LSTM at every time step, but the added noise made the model overfit easily.
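A small sketch of the start/end-token wrapping (the vocabulary and indices are hypothetical):

vocab = {'<start>': 1, '<end>': 2, 'a': 3, 'dog': 4, 'runs': 5}  # made-up word ids
caption = ['a', 'dog', 'runs']
ids = [vocab['<start>']] + [vocab[w] for w in caption] + [vocab['<end>']]
# ids == [1, 3, 4, 5, 2]; nn.Embedding then looks up each id,
# which is equivalent to multiplying a one-hot row into the embedding matrix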
The loss is the negative sum of per-step log-probabilities (log loss):
L(I,S) = -\sum_{t=1}^{N} \log p_t(S_t)
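This is exactly what cross-entropy over raw scores computes; a quick sketch checking the equivalence (random logits, reduction='sum', a 10-word vocabulary assumed for illustration):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)           # per-step scores over a 10-word vocabulary, N = 4
targets = torch.tensor([3, 1, 7, 2])  # ground-truth word ids S_1..S_N
loss = F.cross_entropy(logits, targets, reduction='sum')
manual = -F.log_softmax(logits, dim=1)[torch.arange(4), targets].sum()
assert torch.allclose(loss, manual)   # loss == -sum_t log p_t(S_t)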
Model code:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace the top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)  # ResNet-152 with ImageNet weights
        modules = list(resnet.children())[:-1]      # drop the final fc layer (the 1000-way classifier)
        self.resnet = nn.Sequential(*modules)       # rebuild the trunk with nn.Sequential
        # resnet.fc.in_features is the input width of the removed fc layer;
        # project the pooled CNN feature into the word-embedding space
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Extract feature vectors from input images."""
        with torch.no_grad():               # keep the CNN trunk frozen
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length

    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generate captions."""
        embeddings = self.embed(captions)
        # the image feature is the input at the first time step only
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])   # hiddens[0] is the packed data tensor
        return outputs

    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seq_length):
            hiddens, states = self.lstm(inputs, states)  # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))    # outputs: (batch_size, vocab_size)
            _, predicted = outputs.max(1)                # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)               # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                 # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)        # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids
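A quick forward-pass shape check for the two modules (batch size, vocabulary size, and lengths are illustrative; pack_padded_sequence expects lengths sorted in decreasing order):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

encoder = EncoderCNN(embed_size=256)   # downloads ImageNet weights on first use
decoder = DecoderRNN(embed_size=256, hidden_size=512, vocab_size=1000, num_layers=1)
images = torch.randn(2, 3, 224, 224)             # two RGB crops
captions = torch.randint(1, 1000, (2, 6))        # padded word ids incl. <start>/<end>
lengths = [6, 4]                                 # true caption lengths, descending

features = encoder(images)                       # (2, 256)
outputs = decoder(features, captions, lengths)   # (sum(lengths), vocab_size) = (10, 1000)
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]   # (10,)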
Training code:
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from data_loader import get_loader
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),  # crop at a random position
        transforms.RandomHorizontalFlip(),      # flip the PIL image horizontally with probability 0.5
        transforms.ToTensor(),                  # convert a PIL image (H*W*C, [0, 255]) to a tensor (C*H*W, [0.0, 1.0])
        transforms.Normalize((0.485, 0.456, 0.406),   # normalize with the ImageNet channel means
                             (0.229, 0.224, 0.225))]) # and standard deviations

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer; only the encoder's new linear/bn layers are trained
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()     # backpropagate to compute gradients
            optimizer.step()    # update all trainable parameters

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='models/', help='path for saving trained models')
    parser.add_argument('--crop_size', type=int, default=224, help='size for randomly cropping images')
    parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl', help='path for vocabulary wrapper')
    parser.add_argument('--image_dir', type=str, default='data/resized2014', help='directory for resized images')
    parser.add_argument('--caption_path', type=str, default='data/captions_train2014.json',
                        help='path for train annotation json file')
    parser.add_argument('--log_step', type=int, default=10, help='step size for printing log info')
    parser.add_argument('--save_step', type=int, default=1000, help='step size for saving trained models')

    # Model parameters
    parser.add_argument('--embed_size', type=int, default=256, help='dimension of word embedding vectors')
    parser.add_argument('--hidden_size', type=int, default=512, help='dimension of lstm hidden states')
    parser.add_argument('--num_layers', type=int, default=1, help='number of layers in lstm')

    parser.add_argument('--num_epochs', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--num_workers', type=int, default=2)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    args = parser.parse_args()
    print(args)
    main(args)
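One detail hidden inside get_loader: pack_padded_sequence requires each batch to be sorted by decreasing caption length. The data_loader module is not shown above, so here is a sketch of the kind of collate_fn it would need (names and details are assumptions, not the actual implementation):

import torch

def collate_fn(data):
    """data: list of (image_tensor, caption_id_tensor) pairs with varying caption lengths."""
    data.sort(key=lambda x: len(x[1]), reverse=True)   # longest caption first
    images, captions = zip(*data)
    images = torch.stack(images, 0)                    # (batch, 3, crop, crop)
    lengths = [len(cap) for cap in captions]
    padded = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    for i, cap in enumerate(captions):
        padded[i, :lengths[i]] = cap                   # zero-pad to the longest caption
    return images, padded, lengths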