An RNN or LSTM can only use information from earlier time steps to predict the output at the current step, but in some problems the current output depends not only on past states but also on future ones. For example, predicting a missing word in a sentence requires looking not just at the preceding text but also at what follows; only then is the prediction truly based on context. A bidirectional recurrent neural network (BRNN) is composed of two RNNs stacked on top of each other, and the output is determined jointly by the states of both RNNs. The structure is shown below:
(Figure: structure of a bidirectional RNN; image sourced from the web.)
As the figure shows, the hidden layer of a bidirectional RNN maintains two values: $A$, which participates in the forward computation, and $A'$, which participates in the backward computation. The final output depends on both $A_t$ and $A'_t$:

$$\hat{y}_t = \mathrm{softmax}(A_t V + A'_t V' + c)$$

where the two hidden states are computed as (row-vector convention, matching the code below)

$$A_t = \tanh(A_{t-1} W + x_t U + b), \qquad A'_t = \tanh(A'_{t+1} W' + x_t U' + b')$$

From this we conclude: in the forward computation the hidden state $A_t$ depends on $A_{t-1}$; in the backward computation the hidden state $A'_t$ depends on $A'_{t+1}$; and the final output depends on the sum of the forward and backward computations.
This post builds a BiRNN containing 10 units with Python and NumPy, trains it on the dinos.txt dataset from Andrew Ng's deep learning course, and presents the training results.
import numpy as np
1 Classic Bidirectional RNN
1.1 Forward Propagation
# Row-wise softmax: y has shape (m, n); each row is normalized into a distribution
def softmax(y):
    y = np.exp(y - y.max(axis=1, keepdims=True))  # subtract the row max for numerical stability
    return y / y.sum(axis=1, keepdims=True)
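A quick sanity check with made-up logits: every row of the output should be a valid probability distribution.

z = np.random.randn(3, 27)
p = softmax(z)
print(np.allclose(p.sum(axis=1), 1.0), (p > 0).all())  # True True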
# Forward pass: compute the forward hidden states a, the backward hidden states a' (ape), and the output y_hat
def birnn_forward(x, a_in, ape_in, parameters_birnn):
    a_in = a_in.copy()
    ww = parameters_birnn['ww']
    wu = parameters_birnn['wu']
    bb = parameters_birnn['bb']
    wv = parameters_birnn['wv']
    ape_in = ape_in.copy()
    wwpe = parameters_birnn['wwpe']
    wupe = parameters_birnn['wupe']
    bbpe = parameters_birnn['bbpe']
    wvpe = parameters_birnn['wvpe']
    bc = parameters_birnn['bc']
    m = len(x)
    a_next = np.zeros([x.shape[0], wu.shape[1]])
    ape_next = np.zeros([x.shape[0], wu.shape[1]])
    for i in range(m):
        # the forward direction reads x[0], x[1], ...; the backward direction reads x[m-1], x[m-2], ...
        a_next_i = np.tanh(a_in @ ww + x[i] @ wu + bb)
        ape_next_i = np.tanh(ape_in @ wwpe + x[m-i-1] @ wupe + bbpe)
        a_next[i] = a_next_i
        ape_next[m-i-1] = ape_next_i
        a_in = a_next_i
        ape_in = ape_next_i
    y_hat = softmax(a_next @ wv + ape_next @ wvpe + bc)
    return a_next, ape_next, y_hat
# Sanity-check the forward pass on random data
x = np.random.randn(10,27)
a_in = np.random.randn(1,5)
ww = np.random.randn(5,5)
wu = np.random.randn(27,5)
bb = np.random.randn(1,5)
ape_in = np.random.randn(1,5)
wwpe = np.random.randn(5,5)
wupe = np.random.randn(27,5)
bbpe = np.random.randn(1,5)
wv = np.random.randn(5,27)
wvpe = np.random.randn(5,27)
bc = np.random.randn(1,27)
parameters_birnn = {"ww":ww,"wu":wu,"bb":bb, "wwpe":wwpe,"wupe":wupe,"bbpe":bbpe, "wv":wv,"wvpe":wvpe,"bc":bc}
a_next, ape_next, y_hat = birnn_forward(x,a_in, ape_in, parameters_birnn)
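For the toy sizes above (10 time steps, one-hot dimension 27, hidden size 5), the shapes work out as follows, and each row of y_hat is a probability distribution:

print(a_next.shape, ape_next.shape, y_hat.shape)  # (10, 5) (10, 5) (10, 27)
print(np.allclose(y_hat.sum(axis=1), 1.0))        # True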
1.2 Gradients
The gradient computation for the BiRNN also splits into a forward and a backward part. The forward part is exactly the same as the gradient computation for a conventional RNN; the backward part mirrors it with the direction of time reversed, as sketched below.
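Since the original figure is unavailable, here is a sketch of the recursions that the code below implements (same row-vector notation as above; $\odot$ denotes elementwise multiplication). With softmax output and cross-entropy loss, the output-layer error at step $t$ is

$$\delta_t = \hat y_t - y_t, \qquad \nabla_{V} = \sum_t A_t^{\top}\delta_t, \qquad \nabla_{c} = \sum_t \delta_t$$

and the error flowing into the forward hidden state obeys

$$\delta^{A}_t = \delta_t V^{\top} + \left(\delta^{A}_{t+1}\odot\left(1-A_{t+1}^{2}\right)\right)W^{\top}$$

$$\nabla_{W} = \sum_t A_{t-1}^{\top}\left(\delta^{A}_t\odot(1-A_t^{2})\right), \qquad \nabla_{U} = \sum_t x_t^{\top}\left(\delta^{A}_t\odot(1-A_t^{2})\right), \qquad \nabla_{b} = \sum_t \delta^{A}_t\odot(1-A_t^{2})$$

The backward-direction parameters $W'$, $U'$, $b'$, $V'$ satisfy the same recursion with time reversed: $\delta^{A'}_t$ receives the term propagated from step $t-1$ rather than from $t+1$.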
def birnn_gradient(x, a_in, ape_in, parameters_birnn, a_next, ape_next, y_hat, y):
    #**** Gradients of the forward-direction recurrence
    a_in = a_in.copy()
    ww = parameters_birnn['ww']
    wu = parameters_birnn['wu']
    wv = parameters_birnn['wv']
    bb = parameters_birnn['bb']
    bc = parameters_birnn['bc']
    a = np.r_[a_in, a_next]  # a_in is row 0, so a has len(x)+1 rows; y_hat has len(x) rows
    m = len(x) - 1  # index of the last time step
    # start with the last hidden step, which has no later step feeding back into it
    delt_first = (y_hat[m] - y[m]).reshape(1, -1)  # output-layer error at the last step
    delt_v = (a[m+1].reshape(1, -1)).T @ delt_first
    delt_c = delt_first
    delt_a = delt_first @ wv.T
    delt_w = (a[m].reshape(1, -1)).T @ (delt_a * (1 - a[m+1]**2))
    delt_u = (x[m].reshape(1, -1)).T @ (delt_a * (1 - a[m+1]**2))
    delt_b = delt_a * (1 - a[m+1]**2)
    delt_a_back = (delt_a * (1 - a[m+1]**2)) @ ww.T
    # accumulate the gradients of the remaining hidden steps, from m-1 down to 0
    for i in range(m):
        delt_first = (y_hat[m-1-i] - y[m-1-i]).reshape(1, -1)
        delt_v += (a[m-i].reshape(1, -1)).T @ delt_first
        delt_c += delt_first
        delt_a = delt_first @ wv.T + delt_a_back
        delt_w += (a[m-1-i].reshape(1, -1)).T @ (delt_a * (1 - a[m-i]**2))
        delt_u += (x[m-1-i].reshape(1, -1)).T @ (delt_a * (1 - a[m-i]**2))
        delt_b += delt_a * (1 - a[m-i]**2)
        delt_a_back = (delt_a * (1 - a[m-i]**2)) @ ww.T
    #**** Gradients of the backward-direction recurrence
    ape_in = ape_in.copy()
    wwpe = parameters_birnn['wwpe']
    wupe = parameters_birnn['wupe']
    wvpe = parameters_birnn['wvpe']
    bbpe = parameters_birnn['bbpe']
    ape = np.r_[ape_next, ape_in]  # ape_in is the last row: the initial state of the backward chain
    # the backward chain ends at step 0, so start the accumulation there
    delt_first = (y_hat[0] - y[0]).reshape(1, -1)  # output-layer error at step 0
    delt_vpe = (ape[0].reshape(1, -1)).T @ delt_first
    delt_ape = delt_first @ wvpe.T
    delt_wpe = (ape[1].reshape(1, -1)).T @ (delt_ape * (1 - ape[0]**2))
    delt_upe = (x[0].reshape(1, -1)).T @ (delt_ape * (1 - ape[0]**2))
    delt_bpe = delt_ape * (1 - ape[0]**2)
    delt_ape_back = (delt_ape * (1 - ape[0]**2)) @ wwpe.T
    # accumulate the gradients of hidden steps 1 through len(x)-1
    for j in range(1, len(x)):
        delt_first = (y_hat[j] - y[j]).reshape(1, -1)
        delt_vpe += (ape[j].reshape(1, -1)).T @ delt_first
        delt_ape = delt_first @ wvpe.T + delt_ape_back
        delt_wpe += (ape[j+1].reshape(1, -1)).T @ (delt_ape * (1 - ape[j]**2))
        delt_upe += (x[j].reshape(1, -1)).T @ (delt_ape * (1 - ape[j]**2))
        delt_bpe += delt_ape * (1 - ape[j]**2)
        delt_ape_back = (delt_ape * (1 - ape[j]**2)) @ wwpe.T
    parameters_birnn_gradient = {'delt_w': delt_w, 'delt_u': delt_u, 'delt_b': delt_b,
                                 'delt_wpe': delt_wpe, 'delt_upe': delt_upe, 'delt_bpe': delt_bpe,
                                 'delt_v': delt_v, 'delt_vpe': delt_vpe, 'delt_c': delt_c}
    return parameters_birnn_gradient
# Sanity-check the gradient computation
y = np.random.randn(10,27)
parameters_birnn_gradient = birnn_gradient(x,a_in,ape_in,parameters_birnn,a_next,ape_next,y_hat,y)
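As a further check, one analytic gradient entry can be compared against a centered finite difference. The helper names y_onehot and total_loss are introduced just for this check, and the labels must be one-hot for the cross-entropy below to be well-defined:

y_onehot = np.zeros((10, 27))
y_onehot[np.arange(10), np.random.randint(0, 27, 10)] = 1

def total_loss(params):
    _, _, yh = birnn_forward(x, a_in, ape_in, params)
    return -np.sum(np.log(np.sum(yh * y_onehot, axis=1)))

grads = birnn_gradient(x, a_in, ape_in, parameters_birnn, a_next, ape_next, y_hat, y_onehot)
eps = 1e-5
p_plus = {k: v.copy() for k, v in parameters_birnn.items()}
p_minus = {k: v.copy() for k, v in parameters_birnn.items()}
p_plus['ww'][0, 0] += eps
p_minus['ww'][0, 0] -= eps
numeric = (total_loss(p_plus) - total_loss(p_minus)) / (2 * eps)
print(numeric, grads['delt_w'][0, 0])  # the two numbers should agree closely if the analytic gradients are right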
2 Training the Bidirectional RNN (Generating New Dinosaur Names)
2.1 Names and Preprocessing
# Read the names
with open("dinos.txt", "r") as f:
    data = f.read()
# Convert to lowercase
data = data.lower()
# Collect the unique characters (unordered)
chars = list(set(data))
# Record the sizes
data_size, vocab_size = len(data), len(chars)
print(sorted(chars))
print(f"{data_size} characters in total, {vocab_size} unique")
Output:
['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 19909 characters in total, 27 unique
# Build a dict mapping each character to an index 0-26, and a second dict mapping each index back to its character
char_to_ix = {ch:i for i, ch in enumerate(sorted(chars))}
ix_to_char = {i:ch for i, ch in enumerate(sorted(chars))}
print(char_to_ix)
print(ix_to_char)
Output:
{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26} {0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
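As a small illustration, a name round-trips through the two dictionaries (the name here is just an example):

name = "tyrannosaurus"  # example name, not necessarily in the dataset
ixs = [char_to_ix[ch] for ch in name]
print(ixs)                                  # [20, 25, 18, 1, 14, 14, 15, 19, 1, 21, 18, 21, 19]
print("".join(ix_to_char[i] for i in ixs))  # tyrannosaurus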
# Gradient clipping (guards against exploding gradients)
def clip_birnn(parameters_birnn_gradient, maxValue):
    delt_w = parameters_birnn_gradient['delt_w']
    delt_u = parameters_birnn_gradient['delt_u']
    delt_b = parameters_birnn_gradient['delt_b']
    delt_wpe = parameters_birnn_gradient['delt_wpe']
    delt_upe = parameters_birnn_gradient['delt_upe']
    delt_bpe = parameters_birnn_gradient['delt_bpe']
    delt_v = parameters_birnn_gradient['delt_v']
    delt_vpe = parameters_birnn_gradient['delt_vpe']
    delt_c = parameters_birnn_gradient['delt_c']
    for gradient in [delt_w, delt_u, delt_b, delt_wpe, delt_upe, delt_bpe, delt_v, delt_vpe, delt_c]:
        np.clip(gradient, -maxValue, maxValue, out=gradient)  # clip in place
    parameters_birnn_gradient_clipped = {'delt_w': delt_w, 'delt_u': delt_u, 'delt_b': delt_b,
                                         'delt_wpe': delt_wpe, 'delt_upe': delt_upe, 'delt_bpe': delt_bpe,
                                         'delt_v': delt_v, 'delt_vpe': delt_vpe, 'delt_c': delt_c}
    return parameters_birnn_gradient_clipped
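A quick demonstration using the gradients computed in the check above (note that np.clip with out= also modifies the input arrays in place):

clipped = clip_birnn(parameters_birnn_gradient, maxValue=2)
print(max(np.abs(v).max() for v in clipped.values()) <= 2)  # True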
2.2 Custom Gradient Descent
# Cross-entropy loss for a single sample
def loss_sample(y_hat, y):
    z = np.sum(y_hat * y, axis=1)  # probability assigned to the correct character at each step
    return -np.sum(np.log(z))
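For intuition: with a one-hot label, the loss reduces to the negative log-probability assigned to the correct class. A tiny made-up example:

yh = np.array([[0.7, 0.2, 0.1]])
yy = np.array([[1.0, 0.0, 0.0]])
print(loss_sample(yh, yy))  # -log(0.7) ≈ 0.357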
# examples is a list of strings (one dinosaur name per element)
def gradient_descent_birnn(examples, vocab_size, a_in, ape_in, parameters_birnn, maxValue=2, learning_rate=0.01, iters=500):
    a_in = a_in.copy()
    ape_in = ape_in.copy()
    n = len(examples)  # total number of samples (1536 dinosaur names)
    ww = parameters_birnn['ww']
    wu = parameters_birnn['wu']
    bb = parameters_birnn['bb']
    wwpe = parameters_birnn['wwpe']
    wupe = parameters_birnn['wupe']
    bbpe = parameters_birnn['bbpe']
    wv = parameters_birnn['wv']
    wvpe = parameters_birnn['wvpe']
    bc = parameters_birnn['bc']
    loss = []
    for i in range(iters):
        loss_samples = 0
        # one full pass over all the samples
        for j in range(n):
            # turn one training sample into an input matrix (x_input) and a label matrix (y)
            x = [char_to_ix[ch] for ch in examples[j]]  # a name as a list of indices, e.g. [1, 1, 3, 8, 5, 14, 15, 19]
            # one-hot encode the sample
            m = len(x)
            x_k = np.zeros((m, vocab_size))
            for idx, val in enumerate(x):
                x_k[idx, val] = 1
            x_start = np.zeros((1, vocab_size))  # at t=0 the input x0 is a zero vector
            x_input = np.r_[x_start, x_k]
            y_end = np.zeros((1, vocab_size))
            y_end[0, 0] = 1  # the end-of-name character "\n"
            y = np.r_[x_k, y_end]
            a_next, ape_next, y_hat = birnn_forward(x_input, a_in, ape_in, parameters_birnn)  # forward pass
            parameters_birnn_gradient = birnn_gradient(x_input, a_in, ape_in, parameters_birnn, a_next, ape_next, y_hat, y)  # gradients
            '''
            # Optional: clip the gradients before updating (disabled in this run)
            parameters_birnn_gradient_clipped = clip_birnn(parameters_birnn_gradient, maxValue)
            ww -= learning_rate*parameters_birnn_gradient_clipped['delt_w']
            wu -= learning_rate*parameters_birnn_gradient_clipped['delt_u']
            bb -= learning_rate*parameters_birnn_gradient_clipped['delt_b']
            wwpe -= learning_rate*parameters_birnn_gradient_clipped['delt_wpe']
            wupe -= learning_rate*parameters_birnn_gradient_clipped['delt_upe']
            bbpe -= learning_rate*parameters_birnn_gradient_clipped['delt_bpe']
            wv -= learning_rate*parameters_birnn_gradient_clipped['delt_v']
            wvpe -= learning_rate*parameters_birnn_gradient_clipped['delt_vpe']
            bc -= learning_rate*parameters_birnn_gradient_clipped['delt_c']
            '''
            ww -= learning_rate * parameters_birnn_gradient['delt_w']
            wu -= learning_rate * parameters_birnn_gradient['delt_u']
            bb -= learning_rate * parameters_birnn_gradient['delt_b']
            wwpe -= learning_rate * parameters_birnn_gradient['delt_wpe']
            wupe -= learning_rate * parameters_birnn_gradient['delt_upe']
            bbpe -= learning_rate * parameters_birnn_gradient['delt_bpe']
            wv -= learning_rate * parameters_birnn_gradient['delt_v']
            wvpe -= learning_rate * parameters_birnn_gradient['delt_vpe']
            bc -= learning_rate * parameters_birnn_gradient['delt_c']
            parameters_birnn = {"ww": ww, "wu": wu, "bb": bb, "wwpe": wwpe, "wupe": wupe, "bbpe": bbpe, "wv": wv, "wvpe": wvpe, "bc": bc}
            loss_samples += loss_sample(y_hat, y)
        loss_samples = loss_samples / n  # average loss over one pass through the data
        loss.append(loss_samples)
    return parameters_birnn, loss
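To make the preprocessing inside the loop concrete, here is what the input and label matrices look like for a made-up three-character name "abc" (x_demo and y_demo mirror x_input and y in the code above):

name = "abc"
x_ix = [char_to_ix[ch] for ch in name]            # [1, 2, 3]
x_k = np.zeros((len(x_ix), vocab_size))
x_k[np.arange(len(x_ix)), x_ix] = 1
x_demo = np.r_[np.zeros((1, vocab_size)), x_k]    # (4, 27): zero start row, then a, b, c
y_demo = np.r_[x_k, np.eye(vocab_size)[[0]]]      # (4, 27): a, b, c, then "\n" (index 0)
print(x_demo.shape, y_demo.shape)                 # (4, 27) (4, 27)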
3 Training Process
# Read the data line by line (each line of dinos.txt is one dinosaur name; examples is a list of those names)
with open('dinos.txt') as f:
    examples = f.readlines()
examples = [x.lower().strip() for x in examples]
# Initialize and train the network
np.random.seed(0)
np.random.shuffle(examples)
a_in = np.random.randn(1,5)
ww = np.random.randn(5,5)
wu = np.random.randn(27,5)
bb = np.random.randn(1,5)
ape_in = np.random.randn(1,5)
wwpe = np.random.randn(5,5)
wupe = np.random.randn(27,5)
bbpe = np.random.randn(1,5)
wv = np.random.randn(5,27)
wvpe = np.random.randn(5,27)
bc = np.random.randn(1,27)
parameters_birnn = {"ww":ww,"wu":wu,"bb":bb, "wwpe":wwpe,"wupe":wupe,"bbpe":bbpe, "wv":wv,"wvpe":wvpe,"bc":bc}
parameters_birnn_last, loss = gradient_descent_birnn(examples,vocab_size,a_in,ape_in,parameters_birnn,iters=50)
# Plot the training loss
import matplotlib.pyplot as plt
plt.figure(figsize=(8,6), dpi=80)
plt.plot(range(len(loss)), loss)
plt.grid()
plt.show()