前言
Github:代码下载
RNN存在着梯度消失的问题,Hochreiter & Schmidhuber (1997)在1997年提出了LSTM(Long Short-Term Memory)长短期记忆单元来解决这个问题,现在自然语言处理的大部分任务都用到了LSTM,例如机器翻译、文本生成等;同理还有GRU。
数据集
数据集跟上一篇博客RNN文本生成使用的一样,都是古诗的语料库,这里不再赘述。
算法实现
以下公式参考来源:LSTM公式,不过原文没有给出最后隐含层到输出层用的什么激活函数,也没有给出损失函数,所以本文进行了以softmax作为激活函数,以及交叉熵作为损失函数的公式补充,并对此进行了一些修改,使其完整。
前向传播
$$
\begin{aligned}
a^t &= \tanh(W_a x^t + U_a h^{t-1}) = \tanh(\hat{a}^t)\\
i^t &= \sigma(W_i x^t + U_i h^{t-1}) = \sigma(\hat{i}^t)\\
f^t &= \sigma(W_f x^t + U_f h^{t-1}) = \sigma(\hat{f}^t)\\
o^t &= \sigma(W_o x^t + U_o h^{t-1}) = \sigma(\hat{o}^t)\\
c^t &= i^t \odot a^t + f^t \odot c^{t-1}\\
h^t &= o^t \odot \tanh(c^t)\\
y^t &= \operatorname{softmax}(W_y h^t)
\end{aligned}
$$

其中 $\sigma$ 为 sigmoid 函数,$\odot$ 表示逐元素相乘;$a^t$ 为候选记忆,$i^t$、$f^t$、$o^t$ 分别为输入门、遗忘门和输出门,$c^t$ 为细胞状态(由 $c^{t-1}$ 递推得到 $c^t$),$h^t$ 为隐层状态,$y^t$ 为输出层的概率分布。
损失函数为:
$$
E = -\sum_{t=1}^{N} \log\left(y^t_{k_t}\right)
$$

其中 $N$ 为序列长度,$k_t$ 为第 $t$ 个时刻目标词(one-hot 标签中取 1 的分量)的下标,即标准的交叉熵损失。
代码实现,跟RNN文本生成并无变化,只是权重初始化、前向传播和后向传播的写法在加了LSTM单元后,有些不同。
def __init__(self):
    # Vocabulary size: each word is encoded as a one-hot vector of this length.
    self.wordDim = 6000
    # Dimension of the LSTM hidden state and cell state.
    self.hiddenDim = 100
    self.Wi, self.Ui = self.initWeights() # input gate weights (input-to-hidden, hidden-to-hidden)
    self.Wf, self.Uf = self.initWeights() # forget gate weights
    self.Wo, self.Uo = self.initWeights() # output gate weights
    self.Wa, self.Ua = self.initWeights() # candidate (cell input) weights
    # Hidden-to-output weight matrix, shape (wordDim, hiddenDim) = (6000, 100).
    # NOTE(review): the original comment claimed (100, 6000), but the code builds (6000, 100).
    self.Wy = np.random.uniform(-np.sqrt(1. / self.wordDim), np.sqrt(1. / self.wordDim), (self.wordDim, self.hiddenDim))
def initWeights(self):
    """Draw one (W, U) weight pair for a gate, uniform in +/- 1/sqrt(fan_in).

    Returns:
        W: input-to-hidden matrix, shape (hiddenDim, wordDim) = (100, 6000).
        U: hidden-to-hidden (recurrent) matrix, shape (hiddenDim, hiddenDim) = (100, 100).
    """
    inputBound = np.sqrt(1. / self.wordDim)
    recurrentBound = np.sqrt(1. / self.hiddenDim)
    W = np.random.uniform(-inputBound, inputBound, (self.hiddenDim, self.wordDim))
    U = np.random.uniform(-recurrentBound, recurrentBound, (self.hiddenDim, self.hiddenDim))
    return W, U
接着,是前向传播的代码。
def forward(self, data):
    """Run the LSTM forward pass over one sequence of word indices.

    Args:
        data: sequence of integer word ids (length T).

    Returns:
        hidden: array (T+1, hiddenDim, 1); hidden[-1] stays zero and doubles
            as the initial hidden state when indexed as hidden[t-1] at t == 0.
        output: array (T, wordDim, 1) of softmax distributions per step.
        states: list of per-step dicts with keys 'a', 'i', 'f', 'o', 'c',
            plus a trailing sentinel {'c': zeros} so states[-1]['c'] yields
            the zero initial cell state during backpropagation.
    """
    T = len(data)
    output = np.zeros((T, self.wordDim, 1))        # per-step softmax output
    hidden = np.zeros((T + 1, self.hiddenDim, 1))  # hidden states (+1 zero row)
    prevCell = np.zeros((self.hiddenDim, 1))       # c^{t-1}, starts at zero
    states = list()
    for t in range(T):
        # One-hot encode the current word.
        x = np.zeros((self.wordDim, 1))
        x[data[t]][0] = 1
        recurrent = hidden[t - 1]                  # h^{t-1}; zeros when t == 0
        # Candidate memory and the three gates.
        a = np.tanh(np.dot(self.Wa, x) + np.dot(self.Ua, recurrent))
        i = self.sigmoid(np.dot(self.Wi, x) + np.dot(self.Ui, recurrent))
        f = self.sigmoid(np.dot(self.Wf, x) + np.dot(self.Uf, recurrent))
        o = self.sigmoid(np.dot(self.Wo, x) + np.dot(self.Uo, recurrent))
        # Cell update: c^t = i ⊙ a + f ⊙ c^{t-1}.
        c = np.multiply(i, a) + np.multiply(f, prevCell)
        states.append({'a': a, 'i': i, 'f': f, 'o': o, 'c': c})
        prevCell = c
        # Hidden state and output distribution.
        hidden[t] = np.multiply(o, np.tanh(c))
        output[t] = self.softmax(np.dot(self.Wy, hidden[t]))
    # Sentinel entry: backprop reads states[t-1]['c'], which at t == 0
    # must be the zero initial cell state.
    states.append({'c': np.zeros((self.hiddenDim, 1))})
    return hidden, output, states
后向传播
后向传播的公式如下:
$$
\begin{aligned}
\delta y^t_j &= \begin{cases} y^t_j - 1, & j = k_t \\ y^t_j, & j \neq k_t \end{cases}\\
\delta h^t &= \frac{\partial E}{\partial h^t} = (W_y)^{\mathsf{T}}\, \delta y^t\\
\delta o^t &= \delta h^t \odot \tanh(c^t)\\
\delta c^t &\mathrel{+}= \delta h^t \odot o^t \odot \left(1-\tanh^2(c^t)\right)\\
\delta a^t &= \delta c^t \odot i^t,\qquad
\delta i^t = \delta c^t \odot a^t,\qquad
\delta f^t = \delta c^t \odot c^{t-1}\\
\delta \hat{a}^t &= \delta a^t \odot \left(1-\tanh^2(\hat{a}^t)\right)\\
\delta \hat{i}^t &= \delta i^t \odot i^t \odot (1-i^t)\\
\delta \hat{f}^t &= \delta f^t \odot f^t \odot (1-f^t)\\
\delta \hat{o}^t &= \delta o^t \odot o^t \odot (1-o^t)\\
\delta c^{t-1} &\mathrel{+}= \delta c^t \odot f^t\\
W_g &\leftarrow W_g - \eta\, \delta \hat{g}^t \left(x^t\right)^{\mathsf{T}},\qquad
U_g \leftarrow U_g - \eta\, \delta \hat{g}^t \left(h^{t-1}\right)^{\mathsf{T}},\qquad g\in\{a,i,f,o\}
\end{aligned}
$$

(注:原文中 $W_a$、$U_a$ 的更新式误写成了由 $W_o$、$U_o$ 更新,这里统一用 $g\in\{a,i,f,o\}$ 表示四组门的权重各自按同样形式更新;$\delta c^{t-1}$ 需乘以遗忘门 $f^t$ 才能正确回传。)
def backPropagation(self, data, label, alpha = 0.002): # backpropagation through time
    """Run BPTT over one (data, label) sequence and apply SGD updates in place.

    Args:
        data: sequence of integer word ids (inputs, length T).
        label: sequence of integer word ids (targets, length T).
        alpha: learning rate.
    """
    hidden, output, states = self.forward(data)
    T = len(output)  # number of time steps
    # dE/dc flowing back from step t+1 (zero at the last step).
    deltaCNext = np.zeros((self.hiddenDim, 1))
    WiUpdate = np.zeros_like(self.Wi)
    WfUpdate = np.zeros_like(self.Wf)
    WoUpdate = np.zeros_like(self.Wo)
    WaUpdate = np.zeros_like(self.Wa)
    UiUpdate = np.zeros_like(self.Ui)
    UfUpdate = np.zeros_like(self.Uf)
    UoUpdate = np.zeros_like(self.Uo)
    UaUpdate = np.zeros_like(self.Ua)
    WyUpdate = np.zeros_like(self.Wy)
    for t in range(T - 1, -1, -1):
        c = states[t]['c']
        i = states[t]['i']
        f = states[t]['f']
        o = states[t]['o']
        a = states[t]['a']
        cPre = states[t - 1]['c']  # zero sentinel from forward() when t == 0
        X = np.zeros((self.wordDim, 1))
        X[data[t]][0] = 1  # one-hot input vector
        # Softmax + cross-entropy gradient: dE/dz = y - onehot(label).
        output[t][label[t]][0] -= 1
        deltaK = output[t].copy()
        deltaH = np.dot(self.Wy.T, deltaK)
        # Gate/candidate deltas already folded with their activation derivatives.
        deltaO = deltaH * np.tanh(c) * o * (1 - o)
        # BUG FIX: the original called np.multiply(deltaH, o, 1 - tanh(c)**2);
        # np.multiply's third positional argument is the `out` buffer, so the
        # (1 - tanh^2(c)) factor was silently dropped from delta_c.
        deltaC = deltaCNext + deltaH * o * (1 - np.tanh(c) ** 2)
        deltaA = deltaC * i * (1 - a ** 2)  # a already equals tanh(a_hat)
        deltaI = deltaC * a * i * (1 - i)
        deltaF = deltaC * cPre * f * (1 - f)
        # BUG FIX: the cell gradient reaching step t-1 must pass through the
        # forget gate (dc^t/dc^{t-1} = f^t); the original propagated deltaC
        # unscaled.
        deltaCNext = deltaC * f
        # Accumulate gradients over all time steps, then update once below.
        WiUpdate += np.dot(deltaI, X.T)
        WfUpdate += np.dot(deltaF, X.T)
        WaUpdate += np.dot(deltaA, X.T)
        WoUpdate += np.dot(deltaO, X.T)
        UiUpdate += np.dot(deltaI, hidden[t - 1].T)
        UfUpdate += np.dot(deltaF, hidden[t - 1].T)
        UaUpdate += np.dot(deltaA, hidden[t - 1].T)
        UoUpdate += np.dot(deltaO, hidden[t - 1].T)
        WyUpdate += np.dot(deltaK, hidden[t].T)
    self.Wi -= alpha * WiUpdate
    self.Wf -= alpha * WfUpdate
    self.Wa -= alpha * WaUpdate
    self.Wo -= alpha * WoUpdate
    self.Ui -= alpha * UiUpdate
    self.Uf -= alpha * UfUpdate
    self.Ua -= alpha * UaUpdate
    self.Uo -= alpha * UoUpdate
    self.Wy -= alpha * WyUpdate
这个是反向传播的公式,其它代码与RNN文本生成的代码一样。