这篇文章是将cs231n中LSTM_Caption重新敲了一遍,所有的模块放在一起,以便于系统的理解整个过程。目的是读懂其中的每一行代码,即使是课程中已经帮你写好了的。
# As usual, a bit of setup
from __future__ import print_function
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
# Jupyter/IPython magic: render matplotlib figures inline in the notebook
# (this line is not valid plain Python; it only works inside IPython).
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
自动加载模块
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
1. 下载数据
from builtins import range
import os, json
import numpy as np
import h5py
BASE_DIR = 'cs231n/datasets/coco_captioning'
def load_coco_data(base_dir=BASE_DIR,
                   max_train=None,
                   pca_features=True):
    """
    Load the preprocessed MS-COCO captioning data from local disk.

    The returned dict contains captions, image features, image URLs and the
    vocabulary:
    - captions: tokenized image captions, stored as arrays of integer word ids
    - features: image features extracted with a VGG-16 network pretrained on
      ImageNet (fc7 layer)
    - URLs: the web link of each image (only links are stored because the
      images themselves would take too much space)
    - vocabulary: idx_to_word / word_to_idx mappings

    Inputs:
    - base_dir: directory containing the preprocessed dataset files.
    - max_train: if not None, randomly subsample this many training captions
      (sampled WITH replacement, see note below).
    - pca_features: if True, load the PCA-reduced features from
      train2014_vgg16_fc7_pca.h5 / val2014_vgg16_fc7_pca.h5; otherwise load
      the raw fc7 features from train2014_vgg16_fc7.h5 / val2014_vgg16_fc7.h5.

    Returns:
    - data: dict with keys train_captions, train_image_idxs, val_captions,
      val_image_idxs, train_features, val_features, idx_to_word, word_to_idx,
      train_urls, val_urls.
    """
    data = {}

    # Captions: note that what is stored are integer word ids, not strings.
    # h5py.File behaves like a Python dict of named datasets.
    caption_file = os.path.join(base_dir, 'coco2014_captions.h5')
    with h5py.File(caption_file, 'r') as f:
        assert(list(f.keys())==['train_captions', 'train_image_idxs', 'val_captions', 'val_image_idxs'])
        for k, v in f.items():
            data[k] = np.asarray(v)

    # Image features. The pca/non-pca choice only changes the filename
    # suffix, so compute it once instead of duplicating the if/else per split.
    suffix = '_pca' if pca_features else ''
    train_feat_file = os.path.join(base_dir, 'train2014_vgg16_fc7%s.h5' % suffix)
    with h5py.File(train_feat_file, 'r') as f:
        assert(list(f.keys())==['features'])
        data['train_features'] = np.asarray(f['features'])
    val_feat_file = os.path.join(base_dir, 'val2014_vgg16_fc7%s.h5' % suffix)
    with h5py.File(val_feat_file, 'r') as f:
        data['val_features'] = np.asarray(f['features'])

    # Vocabulary: a JSON file holding idx_to_word and word_to_idx.
    dict_file = os.path.join(base_dir, 'coco2014_vocab.json')
    with open(dict_file, 'r') as f:
        dict_data = json.load(f)
        for k, v in dict_data.items():
            data[k] = v

    # Image URLs, one per line.
    train_url_file = os.path.join(base_dir, 'train2014_urls.txt')
    with open(train_url_file, 'r') as f:
        train_urls = np.asarray([line.strip() for line in f])
    data['train_urls'] = train_urls
    val_url_file = os.path.join(base_dir, 'val2014_urls.txt')
    with open(val_url_file, 'r') as f:
        val_urls = np.asarray([line.strip() for line in f])
    data['val_urls'] = val_urls

    # Subsample the training captions (and the matching image indices).
    # NOTE: np.random.randint samples WITH replacement, so duplicates are
    # possible; this matches the original behavior and is kept deliberately.
    if max_train is not None:  # max_train is the desired number of training samples
        num_train = data['train_captions'].shape[0]
        mask = np.random.randint(num_train, size=max_train)
        data['train_captions'] = data['train_captions'][mask]
        data['train_image_idxs'] = data['train_image_idxs'][mask]

    return data
# Load the dataset and print a short summary of every entry:
# shape/dtype for numpy arrays, length for plain containers.
data = load_coco_data(BASE_DIR)
for k, v in data.items():
    # isinstance is the idiomatic type check (type(v) == np.ndarray would
    # reject ndarray subclasses).
    if isinstance(v, np.ndarray):
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))
train_captions <class 'numpy.ndarray'> (400135, 17) int32
train_image_idxs <class 'numpy.ndarray'> (400135,) int32
val_captions <class 'numpy.ndarray'> (195954, 17) int32
val_image_idxs <class 'numpy.ndarray'> (195954,) int32
train_features <class 'numpy.ndarray'> (82783, 512) float32
val_features <class 'numpy.ndarray'> (40504, 512) float32
idx_to_word <class 'list'> 1004
word_to_idx <class 'dict'> 1004
train_urls <class 'numpy.ndarray'> (82783,) <U63
val_urls <class 'numpy.ndarray'> (40504,) <U63
2. LSTM
2.1 LSTM step forward
这个图是来自Ng的courser课程作业中的截图。虽然Ng的课程作业相比cs231n的作业太简单了,完全的手把手的教。。感觉不太好,很多东西模块化了看不到,不知道具体细节。但是Ng的课讲的确实是好!而且里面的图画的也很好,就借鉴过来了~但符号标注跟cs231n不太一样,不过能理解就没问题!
vanilla RNN 由于重复的矩阵乘法会导致梯度消失和梯度爆炸,因此很难训练长序列。LSTM 通过用门控机制(gating mechanism)代替 vanilla RNN 的简单更新规则来解决这个问题。
代码与上图中的符号不太一致。图中隐藏层用 $a^{\langle t-1\rangle}$、$a^{\langle t\rangle}$ 表示,而在代码中用 prev_h 和 next_h 表示。代码中的 a 表示尚未经过激活函数的 gate 输入。
然后经过激活函数:ai 表示更新门 $\sigma_u$,af 表示遗忘门 $\sigma_f$,ao 表示输出门 $\sigma_o$,ag 表示要更新的 memory cell 即 $\tilde{c}^{\langle t\rangle}$,next_c 表示新的 memory cell 状态。
def sigmoid(x):
    """
    Numerically stable logistic sigmoid, applied elementwise.

    For x >= 0 we evaluate 1 / (1 + exp(-x)); for x < 0 we evaluate the
    algebraically equivalent exp(x) / (1 + exp(x)). In both branches the
    exponential takes a non-positive argument, so exp never overflows for
    large |x| (it can only underflow harmlessly to 0).
    """
    nonneg = (x >= 0)
    neg = (x < 0)
    # exp_term holds exp(-|x|) for every element.
    exp_term = np.zeros_like(x)
    exp_term[nonneg] = np.exp(-x[nonneg])
    exp_term[neg] = np.exp(x[neg])
    # numerator is 1 where x >= 0 and exp(x) where x < 0.
    numer = np.ones_like(x)
    numer[neg] = exp_term[neg]
    return numer / (1 + exp_term)
def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
"""
Forward pass for a single timestep of an LSTM.
The input data has dimension D, the hidden state has dimension H, and we use
a minibatch size of N.
Inputs:
- x: Input data, of shape (N, D)
- prev_h: Previous hidden state, of shape (N, H)
- prev_c: previous cell state, of shape (N, H)
- Wx: Input-to-hidden weights, of shape (D, 4H)
- Wh: Hidden-to-hidden weights, of shape (H, 4H)
- b: Biases, of shape (4H,)
Returns a tuple of:
- next_h: Next hidden state, of shape (N, H)
- next_c: Next cell state, of shape (N, H)
- cache: Tuple of values needed for backward pass.
"""
next_h, next_c, cache = None, None, None
#############################################################################
# TODO: Implement the forward pass for a single timestep of an LSTM. #
# You may want to use the numerically stable sigmoid implementation above. #
#############################################################################
N, H = prev_h.shape
a = x.dot(Wx) + prev_h.dot(Wh) + b