本文在学习吴恩达老师深度学习课程-“序列模型”的基础上,使用python+numpy实现RNN、LSTM网络,利用课程中的数据集dinos.txt完成了课后作业。
1 经典的RNN
(图片来源于网络,如有侵权,请及时告知删除)
在每个时间步 $t$,RNN 会根据给定字符 $x^{\langle t \rangle}$ 预测下一个字符。数据集 $X=(x^{\langle 1 \rangle},x^{\langle 2 \rangle},\dots,x^{\langle T_x \rangle})$ 是一个列表类型的字符训练集,标签 $Y=(y^{\langle 1 \rangle},\dots,y^{\langle T_y \rangle})$ 在每个时间步亦是如此,且 $y^{\langle t \rangle}=x^{\langle t+1 \rangle}$。
1.1 RNN前向传播
1.1 RNN前向传播
循环神经网络可以看作是RNN单元的重复,首先要实现单个时间步的计算。单元输入 $x^{\langle t \rangle}$ 与 $a^{\langle t-1 \rangle}$(包含过去信息的上一隐藏层的激活值),输出 $a^{\langle t \rangle}$ 给下一个RNN单元,同时生成预测 $\hat{y}^{\langle t \rangle}$。这由以下几步来完成:
1. 使用tanh函数计算隐藏单元的激活值:$a^{\langle t \rangle}=\tanh(a^{\langle t-1 \rangle}W_w + x^{\langle t \rangle}W_u + b_b)$;
2. 使用softmax,得到预测:$\hat{y}^{\langle t \rangle}=\mathrm{softmax}(a^{\langle t \rangle}W_v + b_c)$;
3. 存储各时间步的 $a^{\langle t \rangle}$ 与 $\hat{y}^{\langle t \rangle}$,供反向传播使用。
本文采用行向量约定并向量化整个序列:$x^{\langle t \rangle}$ 的维度为 $(1,27)$,$a^{\langle t \rangle}$ 的维度为 $(1,5)$;因此 $W_u$ 的维度为 $(27,5)$,$W_w$ 为 $(5,5)$,$W_v$ 为 $(5,27)$。
import numpy as np
# y is a 2-D array whose rows are independent logit vectors, e.g. shape (1, n).
def softmax(y):
    """Row-wise softmax of a 2-D array.

    Subtracts the per-row maximum before exponentiating so that large
    logits do not overflow np.exp; the shift cancels out mathematically,
    so the result is identical to the naive formula for moderate inputs.
    """
    y = np.exp(y - y.max(axis=1, keepdims=True))
    return y / y.sum(axis=1, keepdims=True)
# Forward pass over a whole one-hot sequence; returns per-step hidden
# states and softmax outputs.
def rnn_forward(X, a_ini, parameters_rnn):
    """Run the RNN over every row of X.

    Parameters:
        X -- (T, vocab) array, one one-hot row per time step
        a_ini -- (1, hidden) initial hidden state (not modified)
        parameters_rnn -- dict with keys 'Ww', 'Wu', 'Wv', 'bb', 'bc'

    Returns:
        a_next -- (T, hidden) hidden state at every time step
        y_hat -- array shaped like X with the softmax prediction per step
    """
    Ww, Wu, Wv = parameters_rnn['Ww'], parameters_rnn['Wu'], parameters_rnn['Wv']
    bb, bc = parameters_rnn['bb'], parameters_rnn['bc']
    steps = len(X)
    a_next = np.zeros([X.shape[0], Wu.shape[1]])
    y_hat = np.zeros(X.shape)
    a_prev = a_ini.copy()  # keep the caller's initial state intact
    for t in range(steps):
        a_t = np.tanh(a_prev@Ww + X[t]@Wu + bb)
        a_next[t] = a_t
        y_hat[t] = softmax(a_t@Wv + bc)
        a_prev = a_t  # carry the hidden state to the next step
    return a_next, y_hat
1.2 RNN的梯度
本文采用10个单元,梯度计算过程如下图:
def rnn_gradient(X, a_ini, parameters_rnn, a_next, y_hat, y):
    """Backpropagation through time (BPTT) for the simple RNN.

    X: one-hot inputs fed to rnn_forward; a_ini: initial hidden state;
    a_next, y_hat: per-step activations/predictions saved by rnn_forward;
    y: one-hot labels aligned with y_hat.
    Returns a dict of gradients summed over all time steps ('delt_*').
    """
    a_ini = a_ini.copy()
    Ww = parameters_rnn['Ww']
    Wu = parameters_rnn['Wu']
    Wv = parameters_rnn['Wv']
    bb = parameters_rnn['bb']
    bc = parameters_rnn['bc']
    a = np.r_[a_ini,a_next] # a has T+1 rows (initial state + one per step); y_hat has T rows
    m = len(X)-1 # index of the last time step (the article's example uses T=10, so m=9)
    # start from the last step: its error has no contribution from the future
    delt_frist = (y_hat[m]-y[m]).reshape(1,-1) # dL/dz of softmax + cross-entropy at the output layer
    delt_v = (a[m+1].reshape(1,-1)).T @ delt_frist
    delt_c = delt_frist
    delt_a = delt_frist @ Wv.T
    delt_w = (a[m].reshape(1,-1)).T @ (delt_a*(1-a[m+1]**2)) # (1 - a**2) is tanh'(z)
    delt_u = (X[m].reshape(1,-1)).T @ (delt_a*(1-a[m+1]**2))
    delt_b = delt_a * (1-a[m+1]**2)
    delt_a_back = (delt_a*(1-a[m+1]**2)) @ Ww.T # error flowing back to the previous hidden state
    # accumulate gradients for steps m-1 .. 0
    for i in range(m):
        delt_frist = (y_hat[m-1-i]-y[m-1-i]).reshape(1,-1)
        delt_v += (a[m-i].reshape(1,-1)).T @ delt_frist
        delt_c += delt_frist
        delt_a = delt_frist @ Wv.T + delt_a_back # local output error + error from the future step
        delt_w += (a[m-1-i].reshape(1,-1)).T @ (delt_a * (1-a[m-i]**2))
        delt_u += (X[m-1-i].reshape(1,-1)).T @ (delt_a * (1-a[m-i]**2))
        delt_b += delt_a * (1-a[m-i]**2)
        delt_a_back = (delt_a*(1-a[m-i]**2)) @ Ww.T
    parameters_rnn_gradient = {'delt_w':delt_w, 'delt_u':delt_u, 'delt_v':delt_v, 'delt_b':delt_b, 'delt_c':delt_c }
    return parameters_rnn_gradient
2 长短时记忆网络(Long Short-Term Memory (LSTM))
2.1 LSTM基础
LSTM单元的结构如下:
(图片来源于网络,如有侵权,请及时告知删除)
(1)LSTM对输入的4个门控/候选处理函数:
$$f_t=\sigma(h_{t-1}w_f+x_t u_f+b_f),\quad i_t=\sigma(h_{t-1}w_i+x_t u_i+b_i),\quad \tilde{a}_t=\tanh(h_{t-1}w_a+x_t u_a+b_a),\quad o_t=\sigma(h_{t-1}w_o+x_t u_o+b_o)$$
(2)LSTM细胞状态的更新:$c_t=f_t\odot c_{t-1}+i_t\odot \tilde{a}_t$
(3)LSTM的输出:$h_t=o_t\odot\tanh(c_t)$,$\hat{y}_t=\mathrm{softmax}(h_t v + b_c)$
2.2 LSTM前向计算
def softmax(y):
    """Row-wise softmax of a 2-D array (duplicate of the definition above).

    Subtracts the per-row maximum before exponentiating so that large
    logits do not overflow np.exp; the shift cancels out mathematically.
    """
    y = np.exp(y - y.max(axis=1, keepdims=True))
    return y / y.sum(axis=1, keepdims=True)
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + e^(-x))."""
    denom = 1.0 + np.exp(-x)
    return 1.0 / denom
# Forward pass of the LSTM over a whole one-hot sequence.
def lstm_forward(X, c_prev, h_prev, parameters_lstm):
    """Run the LSTM over every row of X.

    Parameters:
        X -- (T, vocab) array, one one-hot row per time step
        c_prev, h_prev -- (1, hidden) initial cell / hidden state
        parameters_lstm -- dict with the 14 LSTM parameters

    Returns:
        F, I, A, O -- per-step gate / candidate activations (needed by BPTT)
        c_next, h_next -- per-step cell / hidden states
        y_pred -- per-step softmax predictions, shaped like X

    The caller's c_prev / h_prev arrays are only rebound, never mutated.
    """
    p = parameters_lstm
    wf, uf, bf = p['wf'], p['uf'], p['bf']
    wi, ui, bi = p['wi'], p['ui'], p['bi']
    wa, ua, ba = p['wa'], p['ua'], p['ba']
    wo, uo, bo = p['wo'], p['uo'], p['bo']
    vv, bc = p['vv'], p['bc']
    steps = len(X)
    hidden = uf.shape[1]
    c_next = np.zeros([X.shape[0], hidden])
    h_next = np.zeros([X.shape[0], wf.shape[1]])
    y_pred = np.zeros(X.shape)
    F = np.zeros([X.shape[0], hidden])
    I = np.zeros([X.shape[0], hidden])
    A = np.zeros([X.shape[0], hidden])
    O = np.zeros([X.shape[0], hidden])
    for t in range(steps):
        ft = sigmoid(h_prev@wf + X[t]@uf + bf)  # forget gate
        it = sigmoid(h_prev@wi + X[t]@ui + bi)  # input gate
        at = np.tanh(h_prev@wa + X[t]@ua + ba)  # candidate cell value
        ot = sigmoid(h_prev@wo + X[t]@uo + bo)  # output gate
        c_prev = c_prev*ft + it*at              # new cell state
        h_prev = ot * np.tanh(c_prev)           # new hidden state
        F[t], I[t], A[t], O[t] = ft, it, at, ot
        c_next[t] = c_prev
        h_next[t] = h_prev
        y_pred[t] = softmax(h_prev@vv + bc)
    return F, I, A, O, c_next, h_next, y_pred
2.3 lstm的梯度
def lstm_gradient(X, c_prev, h_prev, parameters_lstm, F,I,A,O, c_next, h_next, y_pred, y):
    """Backpropagation through time (BPTT) for the LSTM.

    X: one-hot inputs fed to lstm_forward; c_prev, h_prev: initial states;
    F, I, A, O, c_next, h_next, y_pred: per-step values saved by lstm_forward;
    y: one-hot labels aligned with y_pred.
    Returns a dict of gradients summed over all time steps ('delt_*').
    """
    wf = parameters_lstm['wf']
    uf = parameters_lstm['uf']
    bf = parameters_lstm['bf']
    wi = parameters_lstm['wi']
    ui = parameters_lstm['ui']
    bi = parameters_lstm['bi']
    wa = parameters_lstm['wa']
    ua = parameters_lstm['ua']
    ba = parameters_lstm['ba']
    wo = parameters_lstm['wo']
    uo = parameters_lstm['uo']
    bo = parameters_lstm['bo']
    vv = parameters_lstm['vv']
    bc = parameters_lstm['bc']
    c = np.r_[c_prev,c_next] # c and h hold T+1 rows (initial state + one per step); X/y_pred hold T rows
    h = np.r_[h_prev,h_next]
    m = len(X)-1 # index of the last time step (the article's example uses T=10, so m=9)
    # start from the last step: its error has no contribution from the future
    delt_frist = (y_pred[m]-y[m]).reshape(1,-1) # dL/dz of softmax + cross-entropy at the output layer
    delt_vv = (h[m+1].reshape(1,-1)).T @ delt_frist
    delt_bc = delt_frist
    delt_h = delt_frist @ vv.T
    delt_o = delt_h * np.tanh(c[m+1]) # h = o * tanh(c)  =>  dL/do = dL/dh * tanh(c)
    delt_wo = (h[m].reshape(1,-1)).T @ (delt_o*O[m]*(1-O[m])).reshape(1,-1)
    delt_uo = (X[m].reshape(1,-1)).T @ (delt_o * O[m]*(1-O[m])).reshape(1,-1)
    delt_bo = (delt_o * O[m]*(1-O[m])).reshape(1,-1)
    delt_c = delt_h * O[m] * (1-np.tanh(c[m+1])**2) # dL/dc through h = o * tanh(c)
    delt_f = delt_c * c[m] # c = f*c_prev + i*a  =>  dL/df = dL/dc * c_prev
    delt_wf = (h[m].reshape(1,-1)).T @ (delt_f*F[m]*(1-F[m])).reshape(1,-1)
    delt_uf = (X[m].reshape(1,-1)).T @ (delt_f*F[m]*(1-F[m])).reshape(1,-1)
    delt_bf = (delt_f*F[m]*(1-F[m])).reshape(1,-1)
    delt_i = delt_c * A[m] # dL/di = dL/dc * a
    delt_wi = (h[m].reshape(1,-1)).T @ (delt_i*I[m]*(1-I[m])).reshape(1,-1)
    delt_ui = (X[m].reshape(1,-1)).T @ (delt_i*I[m]*(1-I[m])).reshape(1,-1)
    delt_bi = (delt_i*I[m]*(1-I[m])).reshape(1,-1)
    delt_a = delt_c * I[m] # dL/da = dL/dc * i
    delt_wa = (h[m].reshape(1,-1)).T @ (delt_a*(1-A[m]**2)).reshape(1,-1)
    delt_ua = (X[m].reshape(1,-1)).T @ (delt_a*(1-A[m]**2)).reshape(1,-1)
    delt_ba = (delt_a*(1-A[m]**2)).reshape(1,-1)
    delt_c_back = delt_c * F[m] # cell-state error carried to the previous step
    delt_h_back = (delt_f*F[m]*(1-F[m])).reshape(1,-1)@wf.T + (delt_i*I[m]*(1-I[m])).reshape(1,-1)@wi.T + \
        (delt_a*(1-A[m]**2)).reshape(1,-1)@wa.T + (delt_o * O[m]*(1-O[m])).reshape(1,-1)@wo.T
    for i in range(m): # walk backwards over steps m-1 .. 0
        delt_frist = (y_pred[m-1-i]-y[m-1-i]).reshape(1,-1)
        delt_vv += (h[m-i].reshape(1,-1)).T @ delt_frist
        delt_bc += delt_frist
        delt_h = delt_frist @ vv.T + delt_h_back # local output error + error from the future step
        delt_o = delt_h * np.tanh(c[m-i])
        delt_wo += (h[m-1-i].reshape(1,-1)).T @ (delt_o*O[m-1-i]*(1-O[m-1-i])).reshape(1,-1)
        delt_uo += (X[m-1-i].reshape(1,-1)).T @ (delt_o*O[m-1-i]*(1-O[m-1-i])).reshape(1,-1)
        delt_bo += (delt_o*O[m-1-i]*(1-O[m-1-i])).reshape(1,-1)
        delt_c = delt_h * O[m-1-i] * (1-np.tanh(c[m-i])**2) + delt_c_back
        delt_f = delt_c * c[m-1-i]
        delt_wf += (h[m-1-i].reshape(1,-1)).T @ (delt_f*F[m-1-i]*(1-F[m-1-i])).reshape(1,-1)
        delt_uf += (X[m-1-i].reshape(1,-1)).T @ (delt_f*F[m-1-i]*(1-F[m-1-i])).reshape(1,-1)
        delt_bf += (delt_f*F[m-1-i]*(1-F[m-1-i])).reshape(1,-1)
        delt_i = delt_c * A[m-1-i]
        delt_wi += (h[m-1-i].reshape(1,-1)).T @ (delt_i*I[m-1-i]*(1-I[m-1-i])).reshape(1,-1)
        delt_ui += (X[m-1-i].reshape(1,-1)).T @ (delt_i*I[m-1-i]*(1-I[m-1-i])).reshape(1,-1)
        delt_bi += (delt_i*I[m-1-i]*(1-I[m-1-i])).reshape(1,-1)
        delt_a = delt_c * I[m-1-i]
        delt_wa += (h[m-1-i].reshape(1,-1)).T @ (delt_a*(1-A[m-1-i]**2)).reshape(1,-1)
        delt_ua += (X[m-1-i].reshape(1,-1)).T @ (delt_a*(1-A[m-1-i]**2)).reshape(1,-1)
        delt_ba += (delt_a*(1-A[m-1-i]**2)).reshape(1,-1)
        delt_c_back = delt_c * F[m-1-i]
        delt_h_back = (delt_f*F[m-1-i]*(1-F[m-1-i])).reshape(1,-1)@wf.T + (delt_i*I[m-1-i]*(1-I[m-1-i])).reshape(1,-1)@wi.T + \
            (delt_a*(1-A[m-1-i]**2)).reshape(1,-1)@wa.T + (delt_o*O[m-1-i]*(1-O[m-1-i])).reshape(1,-1)@wo.T
    parameters_lstm_gradient = {'delt_wf':delt_wf,'delt_uf':delt_uf,'delt_bf':delt_bf,
        'delt_wi':delt_wi,'delt_ui':delt_ui,'delt_bi':delt_bi,
        'delt_wa':delt_wa,'delt_ua':delt_ua,'delt_ba':delt_ba,
        'delt_wo':delt_wo,'delt_uo':delt_uo,'delt_bo':delt_bo,
        'delt_vv':delt_vv, 'delt_bc':delt_bc}
    return parameters_lstm_gradient
3 字符级语言模型--恐龙的名字
课程作业已经给出了含所有的恐龙名字的数据集。为了构建字符级语言模型来生成新的名称,将建立模型来学习不同的名称模式,并随机生成新的名字。下面将:
- 存储文本数据以便使用RNN进行处理;
- 合成数据,通过每次采样预测,并将其传递给下一个RNN单元;
- 构建字符级文本生成循环神经网络;
- 执行梯度修剪。
3.1 数据集与预处理
首先读取恐龙名称的数据集,创建一个唯一字符列表(如a-z和换行符),并计算数据集和词汇量大小。
# Read the raw dataset of dinosaur names (one name per line).
# BUG FIX: use a context manager so the file handle is closed
# (the original `open(...).read()` leaked it).
with open("dinos.txt", "r") as f:
    data = f.read()
# Lower-case everything so the vocabulary is just a-z plus '\n'.
data = data.lower()
# Unique characters of the corpus (unordered).
chars = list(set(data))
# Corpus length and vocabulary size.
data_size, vocab_size = len(data), len(chars)
print(chars)
print(f"共计有{data_size}个字符,唯一字符有{vocab_size}个 ")
输出:
['a', 's', 'h', 'm', 'g', 'y', 'o', 'v', 'q', 'w', 'e', 'l', 'u', 'f', 'r', 'i', 'k', 'b', 'z', 'c', 'p', 't', '\n', 'd', 'j', 'x', 'n'] 共计有19909个字符,唯一字符有27个
这里有26个字符加上“\n”,其中的“\n”(换行符)起到类似于句子中的EOS(句子结尾)的作用,表示了名字的结尾。
下面,我们将创建一个字典,每个字符映射到0-26个索引;然后再创建一个字典,该字典将每个索引映射回相应的字符,它会帮助我们找出softmax层的概率分布输出中的字符。
# Two lookup tables over a fixed (sorted) ordering of the vocabulary, so
# '\n' maps to index 0 and 'a'..'z' to 1..26: char -> index and index -> char.
sorted_chars = sorted(chars)
char_to_ix = {ch: i for i, ch in enumerate(sorted_chars)}
ix_to_char = dict(enumerate(sorted_chars))
print(char_to_ix)
print(ix_to_char)
输出:{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26} {0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
3.2 梯度修剪和采样模块
3.2.1 梯度修剪
为避免梯度爆炸,定义clip梯度修剪函数:梯度向量的每一个元素都被限制在 $[-N,N]$ 的范围,如果梯度大于 $N$,将取值为 $N$;如果小于 $-N$,将取值为 $-N$;否则不变。
def clip_rnn(parameters_rnn_gradient, maxValue):
    """Clip every RNN gradient to [-maxValue, maxValue].

    The arrays are clipped in place (np.clip with out=), and a new dict
    holding the same (now clipped) arrays is returned.
    """
    keys = ('delt_w', 'delt_u', 'delt_v', 'delt_b', 'delt_c')
    for key in keys:
        grad = parameters_rnn_gradient[key]
        np.clip(grad, -maxValue, maxValue, out=grad)
    return {key: parameters_rnn_gradient[key] for key in keys}
def clip_lstm(parameters_lstm_gradient, maxValue):
    """Clip every LSTM gradient to [-maxValue, maxValue].

    The arrays are clipped in place (np.clip with out=), and a new dict
    holding the same (now clipped) arrays is returned.
    """
    keys = ('delt_wf', 'delt_uf', 'delt_bf',
            'delt_wi', 'delt_ui', 'delt_bi',
            'delt_wa', 'delt_ua', 'delt_ba',
            'delt_wo', 'delt_uo', 'delt_bo',
            'delt_vv', 'delt_bc')
    for key in keys:
        grad = parameters_lstm_gradient[key]
        np.clip(grad, -maxValue, maxValue, out=grad)
    return {key: parameters_lstm_gradient[key] for key in keys}
3.2.2 采样
假设模型已经训练过了,我们希望生成新的文本,生成的过程如下图:
在上图中,对于经过训练的RNN模型,我们用sample函数来进行采样。它分成四个步骤:
- 步骤1:网络的第一个输入是 $x^{\langle 1 \rangle}=\vec{0}$(零向量),这是在生成任何字符之前的输入,同时我们设置初始隐藏状态 $a^{\langle 0 \rangle}$;
- 步骤2:运行一次前向传播,得到 $a^{\langle t \rangle}$ 和 $\hat{y}^{\langle t \rangle}$。公式如下:$a^{\langle t \rangle}=\tanh(a^{\langle t-1 \rangle}W_w + x^{\langle t \rangle}W_u + b_b)$,$\hat{y}^{\langle t \rangle}=\mathrm{softmax}(a^{\langle t \rangle}W_v + b_c)$;
- 步骤3:采样,根据 $\hat{y}^{\langle t \rangle}$ 指定的概率分布选择下一个字符的索引。假设 $\hat{y}^{\langle t \rangle}_i=0.16$,那么选择索引 $i$ 的概率为0.16,我们使用np.random.choice函数来实现这个功能;
- 步骤4:用采样得到的字符的one-hot向量作为下一步的输入 $x^{\langle t+1 \rangle}$,循环步骤2、3,直到采样到'\n'字符,表明已经达到恐龙名称的末尾。
(1) RNN 采样
def sample_rnn(parameters_rnn, a_ini, vocab_size, char_to_ix):
    """Sample one name (a sequence of character indices) from the trained RNN.

    Parameters:
        parameters_rnn -- dict with keys 'Ww', 'Wu', 'Wv', 'bb', 'bc'
        a_ini -- (1, hidden) initial hidden state
        vocab_size -- number of characters in the vocabulary
        char_to_ix -- dict mapping characters to indices

    Returns:
        indices -- list of sampled character indices, ending with the
                   index of '\\n' (forced after 50 characters).
    """
    Ww = parameters_rnn['Ww']
    Wu = parameters_rnn['Wu']
    Wv = parameters_rnn['Wv']
    bb = parameters_rnn['bb']
    bc = parameters_rnn['bc']
    x = np.zeros((1, vocab_size))
    a_prev = a_ini  # hidden state carried from step to step
    indices = []
    idx = -1
    counter = 0
    newline_character = char_to_ix["\n"]
    while(idx != newline_character and counter<50):
        # BUG FIX: the original kept feeding a_ini into the recurrence and
        # only assigned an unused a_prev, so the hidden state never advanced.
        a = np.tanh(a_prev@Ww + x@Wu + bb)
        y_pred = softmax(a@Wv + bc)
        np.random.seed(counter)  # deterministic draws, as in the original
        idx = np.random.choice(list(range(vocab_size)), p=y_pred.ravel())
        indices.append(idx)
        # next input is the one-hot vector of the sampled character
        x = np.zeros((1, vocab_size))
        x[0,idx] = 1
        a_prev = a
        counter += 1
    if (counter ==50):
        indices.append(char_to_ix["\n"])
    return indices
(2) LSTM采样
def sample_lstm(parameters_lstm,c_prev,h_prev,vocab_size,char_to_ix):
    """Sample one name (a sequence of character indices) from the trained LSTM.

    Parameters:
        parameters_lstm -- dict with the 14 LSTM parameters (f/i/a/o gates, vv, bc)
        c_prev, h_prev -- (1, hidden) initial cell / hidden state
        vocab_size -- number of characters in the vocabulary
        char_to_ix -- dict mapping characters to indices

    Returns:
        indices -- list of sampled character indices, ending with the
                   index of '\\n' (forced after 50 characters).
    """
    p = parameters_lstm
    wf, uf, bf = p['wf'], p['uf'], p['bf']
    wi, ui, bi = p['wi'], p['ui'], p['bi']
    wa, ua, ba = p['wa'], p['ua'], p['ba']
    wo, uo, bo = p['wo'], p['uo'], p['bo']
    vv, bc = p['vv'], p['bc']
    eol = char_to_ix["\n"]
    x = np.zeros((1, vocab_size))
    indices = []
    idx = -1
    step = 0
    while idx != eol and step < 50:
        ft = sigmoid(h_prev@wf + x@uf + bf)  # forget gate
        it = sigmoid(h_prev@wi + x@ui + bi)  # input gate
        at = np.tanh(h_prev@wa + x@ua + ba)  # candidate cell value
        ot = sigmoid(h_prev@wo + x@uo + bo)  # output gate
        c_prev = c_prev*ft + it*at
        h_prev = ot * np.tanh(c_prev)
        probs = softmax(h_prev@vv + bc)
        np.random.seed(step)  # deterministic draws, as in the RNN sampler
        idx = np.random.choice(list(range(vocab_size)), p=probs.ravel())
        indices.append(idx)
        # next input is the one-hot vector of the sampled character
        x = np.zeros((1, vocab_size))
        x[0, idx] = 1
        step += 1
    if step == 50:
        indices.append(eol)
    return indices
4 训练过程
4.1 RNN的训练
在这里,我们将实现一个随机梯度下降函数(带梯度修剪)。过程:
- 前向计算损失;
- 反向传播计算关于参数的梯度损失;
- 修剪梯度;
- 使用梯度下降更新参数。
# Cross-entropy loss of one sample, summed over all time steps.
def lost_sample(y_pred,y):
    """Return -sum_t log(probability assigned to the true character at t)."""
    true_probs = (y_pred * y).sum(axis=1)
    return -np.log(true_probs).sum()
# `examples` is a list of name strings (one dinosaur name each).
def gradient_descent_rnn(examples,vocab_size,a_ini,parameters_rnn,maxValue=5,learning_rate=0.01,iters=500):
    """Train the RNN with per-sample stochastic gradient descent + clipping.

    For each name: build one-hot input/label matrices (input = zero vector
    followed by the name; label = the name followed by '\\n'), run the
    forward pass, backprop, clip gradients to [-maxValue, maxValue] and
    take one SGD step. Returns (parameters_rnn, loss) where loss holds the
    per-epoch average loss.

    NOTE(review): the `-=` updates mutate the weight arrays in place, so
    the caller's `parameters_rnn` dict also sees the trained values.
    Depends on the module-level `char_to_ix` mapping.
    """
    a_ini = a_ini.copy()
    n = len(examples) # total number of training samples
    Ww = parameters_rnn['Ww']
    Wu = parameters_rnn['Wu']
    Wv = parameters_rnn['Wv']
    bb = parameters_rnn['bb']
    bc = parameters_rnn['bc']
    loss = []
    for i in range(iters):
        loss_samples = 0
        # one full pass over all samples
        for j in range(n):
            # turn one sample into an input matrix (X_input) and a label matrix (y)
            X = [char_to_ix[ch] for ch in examples[j]] # e.g. 'aachen...' -> [1, 1, 3, 8, 5, 14, ...]
            # one-hot encode the sample
            m = len(X)
            X_k = np.zeros((m,vocab_size))
            for idx,val in np.ndenumerate(X):
                X_k[idx,val] = 1
            x_start = np.zeros((1,vocab_size)) # x<0> is the zero vector at t=0
            X_input = np.r_[x_start,X_k]
            y_end = np.zeros((1,vocab_size))
            y_end[0,0] = 1 # final label is the end-of-name character (index 0)
            y = np.r_[X_k,y_end]
            a_next, y_hat = rnn_forward(X_input,a_ini,parameters_rnn) # forward pass
            parameters_rnn_gradient = rnn_gradient(X_input,a_ini,parameters_rnn,a_next,y_hat,y) # BPTT gradients
            parameters_rnn_gradient_clipped = clip_rnn(parameters_rnn_gradient, maxValue) # gradient clipping
            Ww -= learning_rate*parameters_rnn_gradient_clipped['delt_w']
            Wu -= learning_rate*parameters_rnn_gradient_clipped['delt_u']
            Wv -= learning_rate*parameters_rnn_gradient_clipped['delt_v']
            bb -= learning_rate*parameters_rnn_gradient_clipped['delt_b']
            bc -= learning_rate*parameters_rnn_gradient_clipped['delt_c']
            parameters_rnn = {'Ww':Ww, "Wu":Wu, "Wv":Wv, 'bb':bb, 'bc':bc}
            loss_samples += lost_sample(y_hat,y)
        loss_samples = loss_samples/n # average loss over this epoch
        loss.append(loss_samples)
    return parameters_rnn, loss
# Initialise parameters and train the RNN.
with open('dinos.txt') as f:
    examples = f.readlines()
examples = [x.lower().strip() for x in examples]
np.random.seed(0)
np.random.shuffle(examples)
a_ini = np.random.randn(1,5)   # initial hidden state (hidden size = 5)
Ww = np.random.randn(5,5)      # hidden -> hidden weights
Wu = np.random.randn(27,5)     # input -> hidden weights
Wv = np.random.randn(5,27)     # hidden -> output weights
bb = np.random.randn(1,5)
bc = np.random.randn(1,27)
parameters_rnn = {'Ww':Ww, "Wu":Wu, "Wv":Wv, 'bb':bb, 'bc':bc}
parameters_rnn_last, loss = gradient_descent_rnn(examples,vocab_size,a_ini,parameters_rnn,iters=500)
# Plot the training loss.
import matplotlib.pyplot as plt
plt.figure(figsize=(8,6),dpi=80)
plt.plot(range(len(loss)),loss)  # was hard-coded range(500); follow the actual length
plt.grid()
plt.show()
# Sample a new dinosaur name from the trained network.
# BUG FIX: sample with the returned `parameters_rnn_last` — the original used
# the pre-training dict, which only worked because the arrays are updated in place.
indices_rnn = sample_rnn(parameters_rnn_last,a_ini,vocab_size,char_to_ix)
print("sampling: ")
print("list of sampled indices: ", indices_rnn)
print("list of sampled characters: ", [ix_to_char[i] for i in indices_rnn])
print("name of sampled:", ''.join([ix_to_char[i] for i in indices_rnn]))
4.2 LSTM的训练
# Cross-entropy loss of one sample (duplicate of the definition above).
def lost_sample(y_pred,y):
    """Per-sample cross-entropy: -sum_t log(prob of the true char at step t)."""
    picked = np.sum(y_pred * y, axis=1)
    return -np.sum(np.log(picked))
# Hand-rolled SGD training loop for the LSTM.
def gradient_descent_lstm(examples,vocab_size,c_prev,h_prev,parameters_lstm,maxValue=5,learning_rate=0.01,iters=500):
    """Train the LSTM with per-sample stochastic gradient descent + clipping.

    Mirrors gradient_descent_rnn: for each name build one-hot input/label
    matrices, run lstm_forward / lstm_gradient, clip gradients to
    [-maxValue, maxValue] and take one SGD step. Returns
    (parameters_lstm, loss) with the per-epoch average loss.

    NOTE(review): the `-=` updates mutate the weight arrays in place, so
    the caller's `parameters_lstm` dict also sees the trained values.
    Depends on the module-level `char_to_ix` mapping.
    """
    n = len(examples) # total number of training samples
    wf = parameters_lstm['wf']
    uf = parameters_lstm['uf']
    bf = parameters_lstm['bf']
    wi = parameters_lstm['wi']
    ui = parameters_lstm['ui']
    bi = parameters_lstm['bi']
    wa = parameters_lstm['wa']
    ua = parameters_lstm['ua']
    ba = parameters_lstm['ba']
    wo = parameters_lstm['wo']
    uo = parameters_lstm['uo']
    bo = parameters_lstm['bo']
    vv = parameters_lstm['vv']
    bc = parameters_lstm['bc']
    loss = []
    for i in range(iters):
        loss_samples = 0
        # one full pass over all samples
        for j in range(n):
            # turn one sample into an input matrix (X_input) and a label matrix (y)
            X = [char_to_ix[ch] for ch in examples[j]] # e.g. 'aachen...' -> [1, 1, 3, 8, 5, 14, ...]
            # one-hot encode the sample
            m = len(X)
            X_k = np.zeros((m,vocab_size))
            for idx,val in np.ndenumerate(X):
                X_k[idx,val] = 1
            x_start = np.zeros((1,vocab_size)) # x<0> is the zero vector at t=0
            X_input = np.r_[x_start,X_k]
            y_end = np.zeros((1,vocab_size))
            y_end[0,0] = 1 # final label is the end-of-name character (index 0)
            y = np.r_[X_k,y_end]
            F,I,A,O,c_next, h_next, y_pred = lstm_forward(X_input,c_prev,h_prev,parameters_lstm) # forward pass
            parameters_lstm_gradient = lstm_gradient(X_input,c_prev,h_prev,parameters_lstm,F,I,A,O,c_next,h_next,y_pred,y) # BPTT gradients
            parameters_lstm_gradient_clipped = clip_lstm(parameters_lstm_gradient, maxValue) # gradient clipping
            wf -= learning_rate*parameters_lstm_gradient_clipped['delt_wf']
            uf -= learning_rate*parameters_lstm_gradient_clipped['delt_uf']
            bf -= learning_rate*parameters_lstm_gradient_clipped['delt_bf']
            wi -= learning_rate*parameters_lstm_gradient_clipped['delt_wi']
            ui -= learning_rate*parameters_lstm_gradient_clipped['delt_ui']
            bi -= learning_rate*parameters_lstm_gradient_clipped['delt_bi']
            wa -= learning_rate*parameters_lstm_gradient_clipped['delt_wa']
            ua -= learning_rate*parameters_lstm_gradient_clipped['delt_ua']
            ba -= learning_rate*parameters_lstm_gradient_clipped['delt_ba']
            wo -= learning_rate*parameters_lstm_gradient_clipped['delt_wo']
            uo -= learning_rate*parameters_lstm_gradient_clipped['delt_uo']
            bo -= learning_rate*parameters_lstm_gradient_clipped['delt_bo']
            vv -= learning_rate*parameters_lstm_gradient_clipped['delt_vv']
            bc -= learning_rate*parameters_lstm_gradient_clipped['delt_bc']
            parameters_lstm = {'wf':wf,'uf':uf,'bf':bf,
                'wi':wi,'ui':ui,'bi':bi,
                'wa':wa,'ua':ua,'ba':ba,
                'wo':wo,'uo':uo,'bo':bo,
                'vv':vv, 'bc':bc}
            loss_samples += lost_sample(y_pred,y)
        loss_samples = loss_samples/n # average loss over this epoch
        loss.append(loss_samples)
    return parameters_lstm, loss
# Read the name list into a list of strings and train the LSTM.
with open('dinos.txt') as f:
    examples = f.readlines()
examples = [x.lower().strip() for x in examples]
np.random.seed(1)
np.random.shuffle(examples)
c_prev = np.random.randn(1,5)  # initial cell state (hidden size = 5)
h_prev = np.random.randn(1,5)  # initial hidden state
wf = np.random.randn(5,5)
uf = np.random.randn(27,5)
bf = np.random.randn(1,5)
wi = np.random.randn(5,5)
ui = np.random.randn(27,5)
bi = np.random.randn(1,5)
wa = np.random.randn(5,5)
ua = np.random.randn(27,5)
ba = np.random.randn(1,5)
wo = np.random.randn(5,5)
uo = np.random.randn(27,5)
bo = np.random.randn(1,5)
vv = np.random.randn(5,27)
bc = np.random.randn(1,27)
parameters_lstm = {'wf':wf,'uf':uf,'bf':bf,
    'wi':wi,'ui':ui,'bi':bi,
    'wa':wa,'ua':ua,'ba':ba,
    'wo':wo,'uo':uo,'bo':bo,
    'vv':vv, 'bc':bc}
parameters_lstm, loss = gradient_descent_lstm(examples,vocab_size,c_prev,h_prev,parameters_lstm,iters=500)
# Plot the training loss.
plt.figure(figsize=(8,6),dpi=80)
plt.plot(range(len(loss)),loss)  # was hard-coded range(500); follow the actual length
plt.grid()
plt.show()
# Sample a new dinosaur name from the trained LSTM.
indices_lstm = sample_lstm(parameters_lstm,c_prev,h_prev,vocab_size,char_to_ix)
print("sampling: ")
print("list of sampled indices: ", indices_lstm)
print("list of sampled characters: ", [ix_to_char[i] for i in indices_lstm])
print("name of sampled:", ''.join([ix_to_char[i] for i in indices_lstm]))