网上有许多讲解LSTM的资料,完备且详细,在这里就不再赘述了。推荐一篇我个人认为对LSTM讲解得比较清晰的文章:Understanding LSTM Networks。学习完LSTM的基本结构之后,我想对LSTM中的各种细节问题有进一步的认识。在TensorFlow中,LSTM模块已经被封装好了,所以我就想研究一下如何只用numpy实现一个LSTM网络。很巧的是,我在YouTube上的《LSTM网络 - 智慧的数学(第8周)》视频中找到了一个LSTM教程,其中提供了利用numpy从零构建LSTM的代码。我阅读了该代码,并且在原有英文注释的基础上,为大部分语句添加了中文注释,希望可以帮助到同样想更详细地了解LSTM的小伙伴们。建议的阅读顺序是:从代码最后的执行语句开始看,跟着函数调用的顺序依次阅读下去即可。其中LSTM的反向传播算法,还是需要小伙伴们先掌握理论推导。如有错误,恳请各位指正!
import numpy as np
class RecurrentNeuralNetwork:
    """Recurrent wrapper that unrolls a single LSTM cell over ``rl`` time steps.

    The network keeps one row of history per time step (inputs, cell states,
    hidden states, gate activations and outputs). Row 0 of every history
    buffer is the all-zero initial state; real time steps use rows 1..rl.
    The hidden state is mapped to the output through ``self.w`` followed by
    a sigmoid.
    """

    def __init__(self, xs, ys, rl, eo, lr):
        """Allocate weights and history buffers.

        Args:
            xs: Length of one input vector.
            ys: Length of one output / hidden / cell-state vector.
            rl: Number of recurrences (time steps) per pass.
            eo: Expected outputs. It is transposed here and a zero row is
                stacked on top, so ``eo`` must have ``ys`` rows and at least
                ``rl`` columns for ``backProp`` to index it.
            lr: Learning rate used by the RMSprop updates.
        """
        # current input vector (starts as all zeros)
        self.x = np.zeros(xs)
        # input size
        self.xs = xs
        # expected output vector (only allocated here)
        self.y = np.zeros(ys)
        # output size
        self.ys = ys
        # weight matrix mapping the hidden state to the output
        self.w = np.random.random((ys, ys))
        # running average of squared gradients for RMSprop, same shape as w
        self.G = np.zeros_like(self.w)
        # number of recurrences
        self.rl = rl
        # learning rate
        self.lr = lr
        # per-step history: inputs
        self.ia = np.zeros((rl + 1, xs))
        # per-step history: cell states
        self.ca = np.zeros((rl + 1, ys))
        # per-step history: network outputs
        self.oa = np.zeros((rl + 1, ys))
        # per-step history: hidden states
        self.ha = np.zeros((rl + 1, ys))
        # per-step history: forget gate activations
        self.af = np.zeros((rl + 1, ys))
        # per-step history: input gate activations
        self.ai = np.zeros((rl + 1, ys))
        # per-step history: candidate cell (tanh layer) activations
        self.ac = np.zeros((rl + 1, ys))
        # per-step history: output gate activations
        self.ao = np.zeros((rl + 1, ys))
        # expected outputs, transposed, with a zero row prepended as row 0
        self.eo = np.vstack((np.zeros(eo.shape[0]), eo.T))
        # the LSTM cell (input size, output size, recurrences, learning rate)
        self.LSTM = LSTM(xs, ys, rl, lr)

    def sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def dsigmoid(self, x):
        """Derivative of the sigmoid with respect to its pre-activation ``x``."""
        return self.sigmoid(x) * (1 - self.sigmoid(x))

    def forwardProp(self):
        """Run the cell over all ``rl`` steps and record every intermediate.

        Returns:
            ``self.oa``, the (rl + 1, ys) buffer of outputs (row 0 is zeros).
        """
        # Start from the stored initial state: backProp treats ca[0] as the
        # cell state preceding step 1, so the forward pass must begin there.
        self.LSTM.cs = self.ca[0].copy()
        for i in range(1, self.rl + 1):
            # Record the input actually fed at this step; backProp rebuilds
            # the LSTM input from ia[i].
            self.ia[i] = self.x
            # LSTM input = previous hidden state followed by current input
            self.LSTM.x = np.hstack((self.ha[i - 1], self.x))
            cs, hs, f, inp, c, o = self.LSTM.forwardProp()
            # store everything backProp needs for this step
            self.ca[i] = cs
            self.ha[i] = hs
            self.af[i] = f
            self.ai[i] = inp
            self.ac[i] = c
            self.ao[i] = o
            # project the hidden state to the output
            self.oa[i] = self.sigmoid(np.dot(self.w, hs))
            # NOTE(review): step i + 1 is fed eo[i - 1] while its target is
            # eo[i + 1], i.e. the input lags the target by two rows. This may
            # be an off-by-one; left unchanged - confirm the intended offset.
            self.x = self.eo[i - 1]
        return self.oa

    def backProp(self):
        """Back-propagate through time and apply one RMSprop update.

        Accumulates gradients for ``self.w`` and for the four LSTM weight
        matrices over steps rl..1, averages them over ``rl`` and updates
        both this object's weights and the cell's weights.

        Returns:
            The signed sum, over all steps, of the error vector handed to
            the LSTM cell. It is only a rough progress indicator.
        """
        totalError = 0
        # gradients flowing back from the following time step
        dfcs = np.zeros(self.ys)  # w.r.t. cell state
        dfhs = np.zeros(self.ys)  # w.r.t. hidden state
        # accumulated gradient for the output weight matrix
        tu = np.zeros((self.ys, self.ys))
        # accumulated gradients for the LSTM weight matrices
        lstm_shape = (self.ys, self.xs + self.ys)
        tfu = np.zeros(lstm_shape)  # forget gate
        tiu = np.zeros(lstm_shape)  # input gate
        tcu = np.zeros(lstm_shape)  # candidate cell
        tou = np.zeros(lstm_shape)  # output gate
        # Walk backwards over the real time steps only. Row 0 is the zero
        # initial state; visiting it would make ha[i - 1] / ca[i - 1] wrap
        # around to the last row and add a bogus forget-gate gradient.
        for i in range(self.rl, 0, -1):
            # error = calculated output - expected output
            error = self.oa[i] - self.eo[i]
            # oa[i] is already sigmoid-activated, so the derivative is
            # a * (1 - a), not dsigmoid(a).
            delta = error * self.oa[i] * (1 - self.oa[i])
            # dL/dw[j, k] = delta[j] * h[k]: an outer product
            tu += np.outer(delta, self.ha[i])
            # propagate the error back through w to the LSTM's output
            # NOTE(review): this propagates the raw error, without the
            # sigmoid derivative used for tu above. Kept as-is because the
            # returned total is derived from it - confirm the intended loss.
            error = np.dot(error, self.w)
            # restore the cell's input and (post-update) cell state for step i
            self.LSTM.x = np.hstack((self.ha[i - 1], self.ia[i]))
            # copy, so the cell never holds a view into the history buffer
            self.LSTM.cs = self.ca[i].copy()
            fu, iu, cu, ou, dfcs, dfhs = self.LSTM.backProp(
                error, self.ca[i - 1], self.af[i], self.ai[i], self.ac[i],
                self.ao[i], dfcs, dfhs)
            totalError += np.sum(error)
            # accumulate the LSTM gradients
            tfu += fu
            tiu += iu
            tcu += cu
            tou += ou
        # update with the average of the accumulated gradients
        self.LSTM.update(tfu / self.rl, tiu / self.rl, tcu / self.rl, tou / self.rl)
        self.update(tu / self.rl)
        return totalError

    def update(self, u):
        """Apply one RMSprop step to ``self.w`` using gradient ``u``."""
        self.G = 0.9 * self.G + 0.1 * u ** 2
        self.w -= self.lr / np.sqrt(self.G + 1e-8) * u
        return

    def sample(self):
        """Generate ``rl`` steps, feeding each output's arg-max back as input.

        The first input is whatever ``self.x`` holds when this is called.
        The history buffers are overwritten with the generated sequence.

        Returns:
            ``self.oa``, the (rl + 1, ys) buffer of outputs (row 0 is zeros).
        """
        # begin from the zero initial state, matching ha[0] used at step 1
        self.LSTM.cs = self.ca[0].copy()
        # start at 1 so the 0th entry of every buffer stays all zeros
        for i in range(1, self.rl + 1):
            # LSTM input = previous hidden state followed by current input
            self.LSTM.x = np.hstack((self.ha[i - 1], self.x))
            cs, hs, f, inp, c, o = self.LSTM.forwardProp()
            # store the input as a one-hot vector at its arg-max position
            maxI = np.argmax(self.x)
            self.x = np.zeros_like(self.x)
            self.x[maxI] = 1
            self.ia[i] = self.x
            # store cell state, hidden state and gate activations
            self.ca[i] = cs
            self.ha[i] = hs
            self.af[i] = f
            self.ai[i] = inp
            self.ac[i] = c
            self.ao[i] = o
            # project the hidden state to the output
            self.oa[i] = self.sigmoid(np.dot(self.w, hs))
            # the arg-max of this output becomes the next one-hot input
            maxI = np.argmax(self.oa[i])
            newX = np.zeros_like(self.x)
            newX[maxI] = 1
            self.x = newX
        return self.oa
class LSTM:
    """A single LSTM cell with forget, input, candidate and output weights.

    The cell's input ``self.x`` is one vector of length ``xs + ys``; its
    first ``ys`` entries are treated as the previous hidden state when the
    hidden-state gradient is extracted in ``backProp``. There are no bias
    terms.
    """

    def __init__(self, xs, ys, rl, lr):
        """Create the cell.

        Args:
            xs: Length of the external input vector.
            ys: Length of the output / cell-state vector.
            rl: Number of recurrences; stored but not used by this class.
            lr: Learning rate for the RMSprop update.
        """
        # combined input vector, length xs + ys
        self.x = np.zeros(xs + ys)
        # combined input size
        self.xs = xs + ys
        # output (hidden state) of the most recent forward step
        self.y = np.zeros(ys)
        # output size
        self.ys = ys
        # cell state
        self.cs = np.zeros(ys)
        # number of recurrences
        self.rl = rl
        # learning rate
        self.lr = lr
        # weight matrices, each mapping xs + ys inputs to ys outputs
        self.f = np.random.random((ys, xs + ys))  # forget gate
        self.i = np.random.random((ys, xs + ys))  # input gate
        self.c = np.random.random((ys, xs + ys))  # candidate cell (tanh layer)
        self.o = np.random.random((ys, xs + ys))  # output gate
        # RMSprop running averages of squared gradients, one per matrix
        self.Gf = np.zeros_like(self.f)
        self.Gi = np.zeros_like(self.i)
        self.Gc = np.zeros_like(self.c)
        self.Go = np.zeros_like(self.o)

    def sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def dsigmoid(self, x):
        """Derivative of the sigmoid with respect to its pre-activation ``x``."""
        return self.sigmoid(x) * (1 - self.sigmoid(x))

    def tangent(self, x):
        """Hyperbolic tangent, applied element-wise."""
        return np.tanh(x)

    def dtangent(self, x):
        """Derivative of tanh with respect to its pre-activation ``x``."""
        return 1 - np.tanh(x) ** 2

    def forwardProp(self):
        """Advance the cell one step using ``self.x`` and ``self.cs``.

        Returns:
            Tuple ``(cs, y, f, i, c, o)``: the new cell state, the new
            hidden state, and the forget, input, candidate and output
            activations.
        """
        f = self.sigmoid(np.dot(self.f, self.x))
        i = self.sigmoid(np.dot(self.i, self.x))
        c = self.tangent(np.dot(self.c, self.x))
        o = self.sigmoid(np.dot(self.o, self.x))
        # Build a new array rather than updating in place, so an array the
        # caller assigned to self.cs is never modified behind its back.
        self.cs = f * self.cs + i * c
        # hidden state for this step
        self.y = o * self.tangent(self.cs)
        return self.cs, self.y, f, i, c, o

    def backProp(self, e, pcs, f, i, c, o, dfcs, dfhs):
        """Back-propagate one step through the cell.

        Expects ``self.x`` to hold this step's input and ``self.cs`` this
        step's cell state *after* the forward update.

        Args:
            e: Gradient of the loss w.r.t. this step's hidden state coming
                from the output layer.
            pcs: Cell state before this step's update.
            f, i, c, o: Activations returned by ``forwardProp`` at this step.
            dfcs: Cell-state gradient arriving from the following step.
            dfhs: Hidden-state gradient arriving from the following step.

        Returns:
            Tuple ``(fu, iu, cu, ou, dpcs, dphs)``: gradients for the four
            weight matrices, then the gradients w.r.t. the previous cell
            state and the previous hidden state.
        """
        # total hidden-state gradient, clipped to [-6, 6]
        e = np.clip(e + dfhs, -6, 6)
        tanh_cs = self.tangent(self.cs)
        # f, i, c and o are already activated, so their local derivatives
        # are written in terms of the activation itself:
        # sigmoid -> a * (1 - a), tanh -> 1 - a ** 2.
        # output gate
        do = tanh_cs * e
        do_pre = do * o * (1 - o)
        ou = np.outer(do_pre, self.x)
        # cell state: through tanh(cs) plus what arrives from the next step
        dcs = np.clip(e * o * (1 - tanh_cs ** 2) + dfcs, -6, 6)
        # candidate cell
        dc = dcs * i
        dc_pre = dc * (1 - c ** 2)
        cu = np.outer(dc_pre, self.x)
        # input gate
        di = dcs * c
        di_pre = di * i * (1 - i)
        iu = np.outer(di_pre, self.x)
        # forget gate
        df = dcs * pcs
        df_pre = df * f * (1 - f)
        fu = np.outer(df_pre, self.x)
        # gradient handed back to the previous cell state
        dpcs = dcs * f
        # gradient w.r.t. the combined input; its first ys entries belong
        # to the previous hidden state
        dx = (np.dot(dc_pre, self.c) + np.dot(do_pre, self.o)
              + np.dot(di_pre, self.i) + np.dot(df_pre, self.f))
        dphs = dx[:self.ys]
        return fu, iu, cu, ou, dpcs, dphs

    def update(self, fu, iu, cu, ou):
        """Apply one RMSprop step to the four weight matrices."""
        # running averages of the squared gradients
        self.Gf = 0.9 * self.Gf + 0.1 * fu ** 2
        self.Gi = 0.9 * self.Gi + 0.1 * iu ** 2
        self.Gc = 0.9 * self.Gc + 0.1 * cu ** 2
        self.Go = 0.9 * self.Go + 0.1 * ou ** 2
        # scaled gradient step
        self.f -= self.lr / np.sqrt(self.Gf + 1e-8) * fu
        self.i -= self.lr / np.sqrt(self.Gi + 1e-8) * iu
        self.c -= self.lr / np.sqrt(self.Gc + 1e-8) * cu
        self.o -= self.lr / np.sqrt(self.Go + 1e-8) * ou
        return
def LoadText():
    """Read ``eminem.txt`` and one-hot encode it character by character.

    Returns:
        Tuple ``(returnData, uniqueWords, output, outputSize, data)``:

        * ``returnData``: identity matrix with the vocabulary appended as
          an extra last row (so the array holds strings).
        * ``uniqueWords``: vocabulary size.
        * ``output``: float array of shape (uniqueWords, outputSize);
          column ``k`` is the one-hot vector of the ``k``-th character.
        * ``outputSize``: number of characters in the text.
        * ``data``: the vocabulary, a sorted list of distinct characters.
    """
    with open("eminem.txt", "r") as text_file:
        # list() on a string splits it into single characters
        text = list(text_file.read())
    outputSize = len(text)
    # Sort the distinct characters so the vocabulary order, and therefore
    # the meaning of each one-hot row, is the same on every run.
    data = sorted(set(text))
    uniqueWords = len(data)
    # Identity matrix over the vocabulary with the characters as a final
    # row; appending strings to floats makes the whole array string-typed.
    returnData = np.append(np.eye(uniqueWords), np.atleast_2d(data), axis=0)
    # character -> row index, so each lookup is O(1)
    index_of = {ch: row for row, ch in enumerate(data)}
    output = np.zeros((uniqueWords, outputSize))
    for col, ch in enumerate(text):
        output[index_of[ch], col] = 1
    return returnData, uniqueWords, output, outputSize, data
# write the predicted output (series of characters) to disk
def ExportText(output, data):
    """Sample one symbol per row of ``output`` and write them to ``output.txt``.

    Each row is normalised to sum to 1 and used as the probability
    distribution over ``data`` for one random draw.

    Args:
        output: 2-D array with one row of non-negative scores per position
            and one column per entry of ``data``.
        data: Sequence of symbols to choose from.
    """
    print(len(data))
    print(output.shape[0])
    chosen = []
    for row in output:
        total = np.sum(row)
        # A row that sums to zero (the network's output buffer keeps an
        # all-zero row 0) cannot be normalised: 0 / 0 gives NaN and
        # np.random.choice rejects it. Skip rows with no usable mass.
        if not np.isfinite(total) or total <= 0:
            continue
        chosen.append(np.random.choice(data, p=row / total))
    with open("output.txt", "w") as text_file:
        text_file.write("".join(chosen))
    return
# Begin program
print("Beginning")
iterations = 5000  # number of training iterations
learningRate = 0.001  # learning rate
# load input/output data: the vocabulary matrix, the number of categories
# (vocabulary size), the one-hot encoded text used as labels, the text
# length and the vocabulary itself
returnData, numCategories, expectedOutput, outputSize, data = LoadText()
print("Done Reading")
# init the RNN from the hyper-parameters and the dataset
RNN = RecurrentNeuralNetwork(numCategories, numCategories, outputSize, expectedOutput, learningRate)
# training loop; the upper bound is iterations + 1 so that exactly
# `iterations` passes run while i still starts at 1 for reporting
for i in range(1, iterations + 1):
    # forward pass over the whole text
    RNN.forwardProp()
    # backward pass; updates all weights and returns the total error
    error = RNN.backProp()
    print("Error on iteration ", i, ": ", error)
    # sample when the error lies inside (-100, 100), and on every 100th
    # iteration regardless
    if -100 < error < 100 or i % 100 == 0:
        # pick a random one-hot seed vector as the first input
        seed = np.zeros_like(RNN.x)
        maxI = np.argmax(np.random.random(RNN.x.shape))
        seed[maxI] = 1
        RNN.x = seed
        # generate some new text
        output = RNN.sample()
        print(output)
        # write it all to disk
        ExportText(output, data)
        print("Done Writing")
print("Complete")