一.结构概述
CNN LSTM架构使用CNN作为前端提取输入数据的特征,然后结合LSTM进行序列预测,常见的应用如:
- Activity Recognition
- Image Description
- Video Description
该结构适合的用途为:
1.输入数据有空间结构,如2D结构,图片像素,或如句子中的词语,段落或文档等1D结构。
2.输入数据有时间结构,如视频中的图片顺序,文本中的词语,或者需要输出含有时间结构的数据。
基本结构如图:
实现代码片段:
CNN
# Stand-alone CNN feature extractor for a single 10x10, one-channel image.
cnn = Sequential()
# One 2x2 convolution filter; the activation and padding modes are string
# names and must be quoted, otherwise they are looked up as variables.
cnn.add(Conv2D(1, (2, 2), activation='relu', padding='same', input_shape=(10, 10, 1)))
# 2x2 max pooling over the feature map.
cnn.add(MaxPooling2D(pool_size=(2, 2)))
# Flatten the pooled feature map into a vector.
cnn.add(Flatten())
将CNN的输出接到LSTM上需要使用TimeDistributed Layer封装CNN:
# Schematic only: every `...` is a placeholder for real layer arguments.
# TimeDistributed wraps the CNN so that it can be connected to the LSTM.
model.add(TimeDistributed(...))
# The LSTM performs the sequence prediction on the CNN output.
model.add(LSTM(...))
# Dense output layer.
model.add(Dense(...))
因此总体实现有两种方式:
- 方式一:用TimeDistributed Layer封装CNN整体
# Schematic only: every `...` is a placeholder for real layer arguments.
# define CNN model
cnn = Sequential()
cnn.add(Conv2D(...))
cnn.add(MaxPooling2D(...))
cnn.add(Flatten())
# define CNN LSTM model: the whole CNN is wrapped in one TimeDistributed layer
model = Sequential()
model.add(TimeDistributed(cnn, ...))
model.add(LSTM(...))
model.add(Dense(...))
- 方式二: 将CNN的每一层都用TimeDistributed Layer封装
# Schematic only: every `...` is a placeholder for real layer arguments.
# Each CNN layer is wrapped in its own TimeDistributed layer.
model = Sequential()
model.add(TimeDistributed(Conv2D(...)))
model.add(TimeDistributed(MaxPooling2D(...)))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(...))
model.add(Dense(...))
二.应用实例:Moving Square Video Prediction Problem
1.问题描述
生成一组帧,组内每一帧都在上一帧的基础上多绘制一个像素点,绘制顺序可以是从左至右,也可以是从右至左。模型的任务是:给定一组帧,判断该组帧是从左至右还是从右至左绘制出来的。这是一个many-to-one的预测问题。如下图:
2.生成训练数据
from numpy import zeros
from random import randint
from random import random
from matplotlib import pyplot
def next_frame(last_step, last_frame, column):
    """Return a copy of ``last_frame`` with one more pixel set, and its row.

    The new pixel is drawn in ``column`` at a row picked at random from the
    rows within one step of ``last_step``, clamped to the frame's row range.

    Args:
        last_step: Row index of the pixel drawn in the previous frame.
        last_frame: 2D array holding the previous frame; it is not modified.
        column: Column index in which the new pixel is drawn.

    Returns:
        A ``(frame, step)`` tuple: the new frame and the chosen row index.
    """
    # the row may move by at most one and must stay inside the frame
    lower = max(0, last_step - 1)
    upper = min(last_frame.shape[0] - 1, last_step + 1)
    # randint includes both end points
    step = randint(lower, upper)
    # copy so that the earlier frame keeps its own content
    frame = last_frame.copy()
    frame[step, column] = 1
    return frame, step
def build_frames(size):
    """Generate ``size`` frames of a dot trail moving across a square image.

    Each frame is the previous one plus a single new pixel in the next
    column, so the trail grows either from the left edge to the right edge
    or the other way round.

    Args:
        size: Side length of each square frame; also the number of frames.

    Returns:
        A ``(frames, right)`` tuple: the list of frames, and 1 when the
        trail runs left-to-right, 0 when it runs right-to-left.
    """
    # create the first frame with a pixel in a random row
    frame = zeros((size, size))
    step = randint(0, size - 1)
    # decide if we are heading left or right
    right = 1 if random() < 0.5 else 0
    col = 0 if right else size - 1
    frame[step, col] = 1
    frames = [frame]
    # create all remaining frames, one column further along each time
    for i in range(1, size):
        col = i if right else size - 1 - i
        frame, step = next_frame(step, frame, col)
        frames.append(frame)
    return frames, right
# generate one sequence of frames
size = 5
frames, right = build_frames(size)
print("right:", "从左至右" if right else "从右至左")
# plot all frames side by side in a single row
pyplot.figure()
for i, frame in enumerate(frames):
    # create a grayscale subplot for each frame
    pyplot.subplot(1, size, i + 1)
    pyplot.imshow(frame, cmap='Greys')
    # turn off the axis scales to make the picture clearer
    ax = pyplot.gca()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
# show the plot once every subplot has been drawn
pyplot.show()
多次运行程序可以输出不同的序列,如:
3.完整代码–网络结构如下:
- 生成的输入图片:50*50
- CNN参数:卷积层filter数:2,kernel:2×2,stride:1;maxpooling层,pool size:2×2
- LSTM参数:1层LSTM含50 memory cells
- 全连接层激活函数sigmoid,二分类问题(判断图片序列从左到右或从右到左)
- 损失函数:binary_crossentropy
- 优化器:AdamOptimizer
from random import random
from random import randint
from numpy import array
from numpy import zeros
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import TimeDistributed
# configure problem: side length of each square frame, which is also the
# number of frames generated per sequence
size = 50
def next_frame(last_step, last_frame, column):
    """Draw one more pixel on a copy of ``last_frame`` and report its row.

    The row is picked at random from ``last_step - 1`` to ``last_step + 1``
    (both included), limited to the valid row indices of the frame.

    Args:
        last_step: Row index of the pixel drawn in the previous frame.
        last_frame: 2D array holding the previous frame; it is not modified.
        column: Column index in which the new pixel is drawn.

    Returns:
        A ``(frame, step)`` tuple: the new frame and the chosen row index.
    """
    # define the scope of the next step, clamped to the frame's rows
    lower = max(0, last_step - 1)
    upper = min(last_frame.shape[0] - 1, last_step + 1)
    # choose the row index for the next step (both bounds are included)
    step = randint(lower, upper)
    # copy the prior frame so it is left untouched
    frame = last_frame.copy()
    # add the new step
    frame[step, column] = 1
    return frame, step
def build_frames(size):
    """Generate one sequence of frames of a dot trail crossing the image.

    The first frame holds a single pixel in the left-most or right-most
    column; every following frame adds one pixel in the next column.

    Args:
        size: Side length of each square frame; also the number of frames.

    Returns:
        A ``(frames, right)`` tuple: the list of frames, and 1 when the
        trail runs left-to-right, 0 when it runs right-to-left.
    """
    # create the first frame with a pixel in a random row
    frame = zeros((size, size))
    step = randint(0, size - 1)
    # decide if we are heading left or right
    right = 1 if random() < 0.5 else 0
    col = 0 if right else size - 1
    frame[step, col] = 1
    frames = [frame]
    # create all remaining frames
    for i in range(1, size):
        col = i if right else size - 1 - i
        frame, step = next_frame(step, frame, col)
        frames.append(frame)
    return frames, right
def generate_examples(size, n_patterns):
    """Generate labelled frame sequences shaped for the network input.

    Args:
        size: Side length of each square frame and number of frames per
            sequence.
        n_patterns: Number of sequences to generate.

    Returns:
        A ``(X, y)`` tuple of arrays. ``X`` has shape
        ``(n_patterns, size, size, size, 1)`` and ``y`` has shape
        ``(n_patterns, 1)``, holding the direction flag of each sequence.
    """
    X, y = [], []
    for _ in range(n_patterns):
        frames, right = build_frames(size)
        X.append(frames)
        y.append(right)
    # reshape to [samples, timesteps, rows, columns, channels]
    X = array(X).reshape(n_patterns, size, size, size, 1)
    y = array(y).reshape(n_patterns, 1)
    return X, y
# define the model: a per-frame CNN followed by an LSTM and a sigmoid output
model = Sequential()
# the time dimension is left as None; each frame is size x size x 1
model.add(TimeDistributed(Conv2D(2, (2, 2), activation='relu'),
                          input_shape=(None, size, size, 1)))
model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print(), which would emit an extra "None" line
model.summary()
可以看到模型结构为:
# number of generated sequences used for training and for evaluation
n_train, n_test = 5000, 100

# train on freshly generated sequences for a single epoch
X, y = generate_examples(size, n_train)
model.fit(X, y, epochs=1, batch_size=32)

# measure loss and accuracy on a new set of generated sequences
X, y = generate_examples(size, n_test)
loss, acc = model.evaluate(X, y, verbose=0)
print('loss:%f,acc:%f' % (loss, acc * 100))
输出:loss:0.001503,acc:100.000000
# prediction on new data
X, y = generate_examples(size, 1)
# Sequential.predict_classes is no longer available in current Keras; for a
# single sigmoid output the class is the probability thresholded at 0.5
yhat = (model.predict(X, verbose=0) > 0.5).astype('int32')
# y and yhat both have shape (1, 1); label 1 means left-to-right
expected = "Right" if y[0, 0] == 1 else "Left"
predicted = "Right" if yhat[0, 0] == 1 else "Left"
print('Expected: %s , Predicted : % s ' % (expected, predicted))
输出:Expected: Right , Predicted : Right