The overall LSTM architecture is shown in the figure below:
The forget gate is shown below:
The forget gate's outputs are all values in (0, 1), since they come out of a sigmoid, so they effectively act as per-dimension weights on the previous cell state.
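For reference, since the figure's own symbols are not reproduced here, in the standard notation the forget gate computes

$$f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f)$$

where $\sigma$ is the sigmoid function, which is exactly why every entry of $f_t$ lies in (0, 1).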
The input gate (which could equally be called an update gate) is shown below:
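In the same standard notation, the input gate produces a gating vector and a candidate cell state, which together with the forget gate update the cell state:

$$i_t = \sigma(W_i \cdot [h_{t-1}, x_t] + b_i), \qquad \tilde{C}_t = \tanh(W_C \cdot [h_{t-1}, x_t] + b_C)$$

$$C_t = f_t \odot C_{t-1} + i_t \odot \tilde{C}_t$$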
The output gate is shown below:
The output gate has two branches: one passes the new hidden state on to the next time step, and the other emits that same hidden state as the output of the current step.
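In formulas (again using the standard notation), the two branches correspond to

$$o_t = \sigma(W_o \cdot [h_{t-1}, x_t] + b_o), \qquad h_t = o_t \odot \tanh(C_t)$$

and the same $h_t$ is both emitted as this step's output and carried forward as the hidden state for the next step.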
Code source: BiLSTM的PyTorch应用 - mathor
'''
code by Tae Hwan Jung(Jeff Jung) @graykode, modify by wmathor
'''
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

dtype = torch.FloatTensor
Prepare the data
sentence = (
    'GitHub Actions makes it easy to automate all your software workflows from continuous integration and delivery to issue triage and more'
)

word2idx = {w: i for i, w in enumerate(list(set(sentence.split())))}
idx2word = {i: w for i, w in enumerate(list(set(sentence.split())))}
n_class = len(word2idx)  # classification problem
max_len = len(sentence.split())
n_hidden = 5

# word2idx = {'automate': 0, 'all': 1, 'and': 2, 'integration': 3, 'your': 4,
#             'issue': 5, 'continuous': 6, 'triage': 7, 'delivery': 8, 'Actions': 9,
#             'from': 10, 'easy': 11, 'software': 12, 'makes': 13, 'it': 14,
#             'workflows': 15, 'GitHub': 16, 'to': 17, 'more': 18}

# idx2word is simply word2idx with its keys and values swapped
# n_class = 19
# max_len = 21
Process the data
def make_data(sentence):
    input_batch = []
    target_batch = []

    words = sentence.split()
    for i in range(max_len - 1):
        input = [word2idx[n] for n in words[:(i + 1)]]
        input = input + [-1] * (max_len - len(input))
        target = word2idx[words[i + 1]]
        input_batch.append(np.eye(n_class)[input])
        target_batch.append(target)

    return torch.Tensor(input_batch), torch.LongTensor(target_batch)

# input_batch: [max_len - 1, max_len, n_class]
input_batch, target_batch = make_data(sentence)
dataset = Data.TensorDataset(input_batch, target_batch)
loader = Data.DataLoader(dataset, 16, True)  # 16 is the batch_size; adjust it to your own machine
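To make the sliding-window construction concrete, here is what the very first training pair produced by make_data looks like (a minimal sketch using only the variables defined above):

words = sentence.split()
sample_input = [word2idx[words[0]]] + [-1] * (max_len - 1)  # only 'GitHub' is visible; the rest is -1 padding
sample_target = word2idx[words[1]]                          # the model must predict 'Actions'
print(np.eye(n_class)[sample_input].shape)                  # (21, 19): one one-hot row per position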
Visualizing the relevant variables:
class BiLSTM(nn.Module):
    def __init__(self):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=n_class, hidden_size=n_hidden, bidirectional=True)
        # fc
        self.fc = nn.Linear(n_hidden * 2, n_class)

    def forward(self, X):
        # X: [batch_size, max_len, n_class]
        batch_size = X.shape[0]
        input = X.transpose(0, 1)  # input : [max_len, batch_size, n_class]

        hidden_state = torch.randn(1 * 2, batch_size, n_hidden)  # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        cell_state = torch.randn(1 * 2, batch_size, n_hidden)    # [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))
        outputs = outputs[-1]  # [batch_size, n_hidden * 2]
        model = self.fc(outputs)  # model : [batch_size, n_class]
        return model

model = BiLSTM()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
PyTorch's input and output format for nn.LSTM is shown in the figure below:
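For reference, these are the documented shape conventions of nn.LSTM with batch_first=False (the default, which the code above relies on):

# nn.LSTM with batch_first=False (the default):
#   input:      [seq_len, batch_size, input_size]
#   (h_0, c_0): [num_layers * num_directions, batch_size, hidden_size]
#   output:     [seq_len, batch_size, num_directions * hidden_size]
#   (h_n, c_n): [num_layers * num_directions, batch_size, hidden_size]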
# Training
for epoch in range(10000):
    for x, y in loader:
        pred = model(x)
        loss = criterion(pred, y)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Pred
predict = model(input_batch).data.max(1, keepdim=True)[1]
print(sentence)
print([idx2word[n.item()] for n in predict.squeeze()])
The results here are slightly better than the original author's, because I changed the padding placeholder to -1 so that the padding no longer clashes with the labels.
Note: if the forward pass of the model above is unclear, take a look at the example below:
class BiLSTM_1(nn.Module):
    def __init__(self):
        super(BiLSTM_1, self).__init__()
        self.lstm = nn.LSTM(input_size=10, hidden_size=5, bidirectional=False)
        # fc
        self.fc = nn.Linear(n_hidden * 1, n_class)

    def forward(self, X):
        # X: [batch_size, max_len, input_size]
        batch_size = X.shape[0]
        input = X.transpose(0, 1)  # input : [max_len, batch_size, input_size]
        # randomly initialize the hidden state and the cell state
        hidden_state = torch.randn(1 * 1, batch_size, n_hidden)  # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        cell_state = torch.randn(1 * 1, batch_size, n_hidden)    # [num_layers(=1) * num_directions(=1), batch_size, n_hidden]

        outputs, (hc, c) = self.lstm(input, (hidden_state, cell_state))
        # outputs : [max_len, batch_size, n_hidden * 1]
        # model = self.fc(outputs)  # model : [batch_size, n_class]
        return outputs, hc, c

model = BiLSTM_1()
a = torch.randn(2, 5, 10)
output, hc, c = model(a)
In fact, outputs stacks the hidden state from every time step, while hc holds only the hidden state of the final time step, which is why output[-1] == hc.
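A quick way to check this claim with the model instance defined just above (valid for this unidirectional, single-layer setup):

print(output.shape)                    # torch.Size([5, 2, 5]): one hidden state per time step
print(hc.shape)                        # torch.Size([1, 2, 5]): the final hidden state only
print(torch.equal(output[-1], hc[0]))  # True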
Finally, I wish you all success in your studies!