使用pytorch实现序列标注
自实现lstm
import torch
import torch.nn as nn
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1-D tensor of integer ids.

    Args:
        seq: Iterable of hashable tokens (words or tags).
        to_ix: Mapping from token to its integer index.

    Returns:
        A ``torch.long`` tensor holding one index per token, in input order.

    Raises:
        KeyError: If a token is not present in ``to_ix``.
    """
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
# Toy corpus: (tokenised sentence, per-token tag) pairs.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]
# Give every distinct word the next free integer id, in first-seen order.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
# Real models usually use larger sizes such as 32 or 64.
# Small sizes are used here so the weights are easy to inspect during training.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
x=prepare_sequence(training_data[0][0],word_to_ix)
y=prepare_sequence(training_data[0][1],tag_to_ix)
print('x:',x)
print('y:',y)
x: tensor([0, 1, 2, 3, 4])
y: tensor([0, 1, 2, 0, 1])
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Args:
        x: Input tensor.

    Returns:
        Tensor of the same shape as ``x`` with values in [0, 1].
    """
    return 1 / (1 + torch.exp(-x))
def tanh(x):
    """Element-wise hyperbolic tangent, written out with exponentials.

    The textbook form ``(e^x - e^-x) / (e^x + e^-x)`` evaluates to inf/inf = NaN
    once ``exp`` overflows (|x| above roughly 88 in float32). The input is
    therefore clamped to [-30, 30] first; tanh already rounds to exactly +/-1
    well inside that range, so the result is unchanged for finite precision.

    Args:
        x: Input tensor.

    Returns:
        Tensor of the same shape as ``x`` with values in [-1, 1].
    """
    x = torch.clamp(x, min=-30.0, max=30.0)
    pos = torch.exp(x)
    neg = torch.exp(-x)
    return (pos - neg) / (pos + neg)
hidden_dim=6
embedding_dim=6
vocab_size=len(word_to_ix)
def init_hidden(hidden_size=None):
    """Return a fresh all-zero ``(h0, c0)`` pair for the start of a sequence.

    There is no hidden state before the first time step, so one is created.
    Each tensor has shape ``(1, 1, hidden_size)``, following the PyTorch LSTM
    layout ``(num_layers * num_directions, batch_size, hidden_dim)``.

    Args:
        hidden_size: Width of the state vectors. Defaults to the module-level
            ``hidden_dim`` when omitted.

    Returns:
        Tuple ``(h0, c0)`` of two independent zero tensors.
    """
    size = hidden_dim if hidden_size is None else hidden_size
    return (torch.zeros(1, 1, size),
            torch.zeros(1, 1, size))
word_embeddings = nn.Embedding(vocab_size, embedding_dim)
embed=word_embeddings(x)
print(embed)
tensor([[-1.9627, 0.8135, -0.4169, 0.5599, -0.3018, 1.1061],
[-0.3190, 1.0058, -0.7057, 0.1204, 1.4937, 0.0279],
[-0.4799, 2.1392, -0.9231, -1.0999, -1.4840, -0.7990],
[-1.0826, 1.0353, 0.4493, 1.1570, 0.2160, 0.7899],
[ 1.2812, 1.0754, 0.7863, 0.6510, -1.1592, -0.4033]],
grad_fn=<EmbeddingBackward>)
one_hot = torch.zeros(len(x), vocab_size).scatter_(1,x.reshape(len(x),1), 1)
print(one_hot)
tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 1., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 1., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 1., 0., 0., 0., 0.]])
embedding_matrix=torch.randn(vocab_size,embedding_dim)
my_embed=torch.matmul(one_hot,embedding_matrix)
print(my_embed)
tensor([[ 0.4017, -0.0790, -0.7208, 1.0096, -0.6415, 0.3977],
[-1.1770, 0.9600, -0.4552, 0.5287, -1.2346, -1.1289],
[ 1.4698, 0.9478, 0.4281, -0.0136, -0.8808, 0.6587],
[ 1.4614, 0.1628, 0.4880, 1.5886, 1.0572, 0.0694],
[ 1.1160, -0.9236, 0.1572, 0.8014, 0.9089, -0.0327]])
# (句长,batch_size,embedding_size)
input_x=embed.view(len(x), 1, -1)
print(input_x)
tensor([[[-1.9627, 0.8135, -0.4169, 0.5599, -0.3018, 1.1061]],
[[-0.3190, 1.0058, -0.7057, 0.1204, 1.4937, 0.0279]],
[[-0.4799, 2.1392, -0.9231, -1.0999, -1.4840, -0.7990]],
[[-1.0826, 1.0353, 0.4493, 1.1570, 0.2160, 0.7899]],
[[ 1.2812, 1.0754, 0.7863, 0.6510, -1.1592, -0.4033]]],
grad_fn=<ViewBackward>)
1.LSTM
1.1 单独计算
- 第i层,t时刻
j是前一时刻的各个cell
$$
\begin{aligned}
f_i^{(t)} &= \sigma\Big(b_i^f+\sum_j U_{i,j}^f x_j^{(t)}+\sum_j W_{i,j}^f h_j^{(t-1)}\Big)\\
g_i^{(t)} &= \sigma\Big(b_i^g+\sum_j U_{i,j}^g x_j^{(t)}+\sum_j W_{i,j}^g h_j^{(t-1)}\Big)\\
q_i^{(t)} &= \sigma\Big(b_i^o+\sum_j U_{i,j}^o x_j^{(t)}+\sum_j W_{i,j}^o h_j^{(t-1)}\Big)\\
s_i^{(t)} &= f_i^{(t)}\, s_i^{(t-1)}+g_i^{(t)}\,\sigma\Big(b_i^c+\sum_j U_{i,j}^c x_j^{(t)}+\sum_j W_{i,j}^c h_j^{(t-1)}\Big)\\
h_i^{(t)} &= \tanh\big(s_i^{(t)}\big)\, q_i^{(t)}
\end{aligned}
$$
# Forget-gate parameters: Uf multiplies the input, Wf multiplies the previous
# hidden state, and bf is the bias (see how f1 is computed further down).
Uf=torch.randn(embedding_dim,hidden_dim)
Wf=torch.randn(hidden_dim,hidden_dim)
bf=torch.randn(hidden_dim)
print(Uf)
print(Wf)
print(bf)
tensor([[-0.2412, -1.2818, -0.7232, 0.9796, -1.3831, 0.0280],
[-2.2550, 1.0024, 0.3181, 2.4625, 0.8185, -0.1705],
[ 0.6749, -1.4820, 0.1306, -0.0302, 0.1076, -0.4431],
[-1.9521, 1.5941, -0.4877, -0.5115, 0.3042, -0.8965],
[ 0.8267, 0.6762, 1.1087, -0.0376, 0.4959, -0.9688],
[-0.2706, 0.6851, 0.8101, 0.3680, 0.1835, -0.4139]])
tensor([[ 1.1376, -1.2257, -0.3329, -0.1501, 1.0706, -1.1383],
[-0.5685, -0.6473, -0.9684, -0.4290, -0.8083, 0.1783],
[-0.3419, -1.3738, 0.0836, -0.3662, -0.2039, 0.0299],
[ 0.4583, -0.4010, -0.9482, -0.1714, 0.0785, -0.5377],
[ 0.7783, 0.4437, -2.0553, -1.8913, 0.8079, 0.7039],
[-0.5302, -1.1906, -1.2803, 0.0609, 0.3618, 0.7094]])
tensor([ 1.6180, 0.4092, -1.1886, -1.1649, 0.7097, 1.4132])
h0,c0=init_hidden()
print(h0)
print(c0)
tensor([[[0., 0., 0., 0., 0., 0.]]])
tensor([[[0., 0., 0., 0., 0., 0.]]])
# Input contribution to the forget-gate pre-activation at time step 0.
print(torch.matmul(input_x[0],Uf))
# Forget gate at time step 0.
f1=sigmoid(bf+torch.matmul(input_x[0],Uf)+torch.matmul(h0,Wf))
print(f1)
tensor([[-3.2840, 5.3954, 1.9123, 0.2252, 3.5592, -0.6763]],
grad_fn=<MmBackward>)
tensor([[[0.1590, 0.9970, 0.6734, 0.2810, 0.9862, 0.6763]]],
grad_fn=<MulBackward0>)
torch.matmul(h0,Wf)
tensor([[[0., 0., 0., 0., 0., 0.]]])
# Input-gate parameters (bias, hidden-to-hidden and input-to-hidden weights).
bg=torch.randn(hidden_dim)
Wg=torch.randn(hidden_dim,hidden_dim)
Ug=torch.randn(embedding_dim,hidden_dim)
print(bg)
print(Wg)
print(Ug)
tensor([-0.9991, -0.3109, -0.3376, -1.8703, -0.0876, 0.4118])
tensor([[ 0.1361, 0.8912, 0.3556, -1.1611, -0.4669, -0.7749],
[-0.9517, -2.1878, -1.1335, 1.8934, -0.4701, -0.0386],
[-0.2086, -0.0997, 0.0195, -0.4307, 0.2007, -0.3712],
[-0.0860, -0.7646, -1.0500, -1.3939, 0.3060, 0.5810],
[ 0.9782, 0.1691, 1.3593, -0.1176, 0.2451, 1.2866],
[ 0.3426, 1.1758, -0.1679, -0.7304, 1.8132, 0.7703]])
tensor([[ 0.7740, -0.5909, 0.3731, -0.2821, 0.4309, 0.3201],
[-0.0408, -2.3477, -0.0902, -0.6489, -0.6137, -0.6363],
[-0.3889, 0.7760, -1.5003, -1.6583, 1.7034, 0.6059],
[ 0.9344, -1.5214, -2.2810, -0.9084, 0.4917, -0.0436],
[-0.3241, 0.2920, -1.4197, -0.7704, 1.3797, 1.0030],
[ 0.4039, -1.4007, 1.1480, -0.5950, 0.2726, -0.3568]])
# Input gate at time step 0.
g1=sigmoid(bg+torch.matmul(input_x[0],Ug)+torch.matmul(h0,Wg))
print(g1)
tensor([[[0.2106, 0.0204, 0.4759, 0.1103, 0.1211, 0.1534]]],
grad_fn=<MulBackward0>)
# Cell-state (candidate) parameters.
bc=torch.randn(hidden_dim)
Wc=torch.randn(hidden_dim,hidden_dim)
Uc=torch.randn(embedding_dim,hidden_dim)
print(bc)
print(Wc)
print(Uc)
tensor([-1.4072, 0.0440, 0.4973, 2.0482, 0.2032, 0.2510])
tensor([[ 2.0180, -0.5751, 0.4657, -1.3219, 2.4918, -0.8496],
[ 0.2287, -1.4079, -0.0104, -0.3973, 1.3936, 1.2032],
[ 0.5597, 0.8178, -0.2663, -0.0518, -1.2287, 0.7666],
[ 1.4284, -0.6757, 1.3944, 0.3908, -0.1043, 1.7851],
[-0.2318, 0.1908, -0.9405, -1.3440, -2.0447, -2.2236],
[ 0.7214, -0.5389, 1.0935, -0.4707, -0.6584, 0.8625]])
tensor([[ 0.2348, 0.7101, -0.2298, 0.4476, 1.2316, 0.3588],
[ 0.9452, -0.3919, -0.1857, 0.5695, -0.7272, -1.2976],
[ 0.3508, -0.3632, 0.9566, -0.8370, -2.0458, -1.2055],
[-1.4784, -0.9333, -0.7207, 1.8996, -1.0026, -0.0988],
[ 1.0030, -0.2087, -0.4728, -1.4157, -0.3052, -1.1199],
[-1.7926, -1.1267, -0.9589, -0.9056, -0.8777, 0.2443]])
# Cell state at time step 0: keep f1 of the old state and add g1 of the
# candidate. The candidate is squashed with a sigmoid here (a textbook LSTM
# would use tanh), matching the s_i^(t) formula used throughout this write-up.
c1=f1*c0+g1*sigmoid(bc+torch.matmul(input_x[0],Uc)+torch.matmul(h0,Wc))
print(c1)
tensor([[[0.0027, 0.0008, 0.1353, 0.1017, 0.0039, 0.0596]]],
grad_fn=<AddBackward0>)
# Output-gate parameters.
bo=torch.randn(hidden_dim)
Wo=torch.randn(hidden_dim,hidden_dim)
Uo=torch.randn(embedding_dim,hidden_dim)
print(bo)
print(Wo)
print(Uo)
tensor([-0.7430, -0.4823, 0.6030, -0.1274, -0.5860, -0.1610])
tensor([[-0.8334, 0.1386, 0.4369, 0.9919, 0.0499, 0.2537],
[ 0.7339, 1.3104, 0.5500, -0.9005, -1.1566, -1.7843],
[-1.6112, -1.0089, -1.0443, 0.3732, -0.6024, -1.1931],
[-1.9338, -0.1763, -0.0256, -0.8732, -1.7940, -1.4747],
[ 0.4316, 1.6072, 0.8072, -0.9294, 0.8270, 0.5840],
[ 0.0676, 0.0690, 0.9222, 0.3463, 0.3679, 0.1482]])
tensor([[-0.9390, -1.6735, 0.2829, 1.0728, -0.6216, 0.2004],
[-0.7808, -0.1753, -0.9838, -1.7960, 0.2015, -0.8450],
[-0.0584, -0.1656, -0.3886, 0.1750, 0.3405, -0.0094],
[ 0.3652, -0.3256, 0.3165, -1.9058, -0.0954, 1.0349],
[-0.1895, -0.2673, -1.4944, 0.7692, -2.3686, 1.3873],
[ 0.9085, 0.9621, 0.8830, -2.6961, -0.2800, -0.7214]])
# Output gate at time step 0.
q1=sigmoid(bo+torch.matmul(input_x[0],Uo)+torch.matmul(h0,Wo))
print(q1)
tensor([[[8.5267e-01, 9.7567e-01, 7.3385e-01, 3.1952e-04, 7.3254e-01,
1.3298e-01]]], grad_fn=<MulBackward0>)
# Hidden state at time step 0: squashed cell state scaled by the output gate.
h1=tanh(c1)*q1
print(h1)
tensor([[[2.2688e-03, 7.6095e-04, 9.8698e-02, 3.2395e-05, 2.8849e-03,
7.9154e-03]]], grad_fn=<MulBackward0>)
单层LSTM-cell
def LSTM_Cell(input_x, h0, c0):
    """Run one time step of the hand-written LSTM using the module-level weights.

    Args:
        input_x: Input for this time step.
        h0: Hidden state from the previous step.
        c0: Cell state from the previous step.

    Returns:
        ``((h1, c1), f1, g1, q1, gc1)`` where ``h1``/``c1`` are the new hidden
        and cell states, ``f1``/``g1``/``q1`` are the forget, input and output
        gate activations, and ``gc1`` is the candidate value.
    """
    # Forget gate.
    f1 = sigmoid(bf + torch.matmul(input_x, Uf) + torch.matmul(h0, Wf))
    # Input gate.
    g1 = sigmoid(bg + torch.matmul(input_x, Ug) + torch.matmul(h0, Wg))
    # Candidate value; this write-up squashes it with a sigmoid, not tanh.
    gc1 = sigmoid(bc + torch.matmul(input_x, Uc) + torch.matmul(h0, Wc))
    # New cell state.
    c1 = f1 * c0 + g1 * gc1
    # Output gate.
    q1 = sigmoid(bo + torch.matmul(input_x, Uo) + torch.matmul(h0, Wo))
    # New hidden state.
    h1 = tanh(c1) * q1
    return (h1, c1), f1, g1, q1, gc1
(h1,c1),_,_,_,_=LSTM_Cell(input_x[0],h0,c0)
print(h1)
print(c1)
tensor([[[2.2688e-03, 7.6095e-04, 9.8698e-02, 3.2395e-05, 2.8849e-03,
7.9154e-03]]], grad_fn=<MulBackward0>)
tensor([[[0.0027, 0.0008, 0.1353, 0.1017, 0.0039, 0.0596]]],
grad_fn=<AddBackward0>)
单层LSTM
# forward
def single_layer_LSTM(input_x):
    """Run the hand-written LSTM cell over a whole sequence, starting from zeros.

    Args:
        input_x: Tensor indexed as ``(time step, batch, feature)``.

    Returns:
        ``(h, (h_last, c_last), c, f, g, q, gc)``: the per-step hidden states,
        the final ``(hidden, cell)`` pair, and the per-step cell states, forget
        gates, input gates, output gates and candidate values. The per-step
        tensors are kept because the manual backward pass needs them.
    """
    h0, c0 = init_hidden()
    steps, batch = input_x.shape[0], input_x.shape[1]
    # One (steps, batch, hidden_dim) buffer per recorded quantity.
    h, c, f, g, q, gc = (torch.zeros(steps, batch, hidden_dim) for _ in range(6))
    for i, x_t in enumerate(input_x):
        (h0, c0), f0, g0, q0, gc0 = LSTM_Cell(x_t, h0, c0)
        h[i] = h0
        c[i] = c0
        f[i] = f0
        g[i] = g0
        q[i] = q0
        gc[i] = gc0
    return h, (h0, c0), c, f, g, q, gc
o,(h1,c1),c,f,g,q,gc=single_layer_LSTM(input_x)
print(o)
print(h1)
print(c1)
tensor([[[2.2688e-03, 7.6095e-04, 9.8698e-02, 3.2395e-05, 2.8849e-03,
7.9154e-03]],
[[2.2140e-02, 7.2907e-03, 1.0052e-02, 2.2311e-02, 4.1039e-03,
8.3458e-02]],
[[6.5945e-03, 2.2127e-02, 3.3992e-01, 1.2278e-01, 2.0307e-01,
1.2748e-03]],
[[1.1699e-02, 3.3651e-02, 1.5326e-01, 1.8081e-04, 6.9607e-02,
3.0697e-02]],
[[7.7960e-03, 7.7988e-04, 1.2081e-01, 4.8651e-02, 1.8456e-01,
3.7786e-02]]], grad_fn=<CopySlices>)
tensor([[[0.0078, 0.0008, 0.1208, 0.0487, 0.1846, 0.0378]]],
grad_fn=<MulBackward0>)
tensor([[[0.1567, 0.0205, 0.1611, 0.3002, 0.2173, 0.2408]]],
grad_fn=<AddBackward0>)
BPTT
一层一层的计算:从T开始
$$
\begin{aligned}
f_i^{(t)} &= \sigma\Big(b_i^f+\sum_j U_{i,j}^f x_j^{(t)}+\sum_j W_{i,j}^f h_j^{(t-1)}\Big)\\
g_i^{(t)} &= \sigma\Big(b_i^g+\sum_j U_{i,j}^g x_j^{(t)}+\sum_j W_{i,j}^g h_j^{(t-1)}\Big)\\
q_i^{(t)} &= \sigma\Big(b_i^o+\sum_j U_{i,j}^o x_j^{(t)}+\sum_j W_{i,j}^o h_j^{(t-1)}\Big)\\
s_i^{(t)} &= f_i^{(t)}\, s_i^{(t-1)}+g_i^{(t)}\,\sigma\Big(b_i^c+\sum_j U_{i,j}^c x_j^{(t)}+\sum_j W_{i,j}^c h_j^{(t-1)}\Big)\\
h_i^{(t)} &= \tanh\big(s_i^{(t)}\big)\, q_i^{(t)}
\end{aligned}
$$
下面的未考虑batch
print(o.view(len(x),-1))
tensor([[2.2688e-03, 7.6095e-04, 9.8698e-02, 3.2395e-05, 2.8849e-03, 7.9154e-03],
[2.2140e-02, 7.2907e-03, 1.0052e-02, 2.2311e-02, 4.1039e-03, 8.3458e-02],
[6.5945e-03, 2.2127e-02, 3.3992e-01, 1.2278e-01, 2.0307e-01, 1.2748e-03],
[1.1699e-02, 3.3651e-02, 1.5326e-01, 1.8081e-04, 6.9607e-02, 3.0697e-02],
[7.7960e-03, 7.7988e-04, 1.2081e-01, 4.8651e-02, 1.8456e-01, 3.7786e-02]],
grad_fn=<ViewBackward>)
print(torch.transpose(o.view(len(x),-1),1,0))
tensor([[2.2688e-03, 2.2140e-02, 6.5945e-03, 1.1699e-02, 7.7960e-03],
[7.6095e-04, 7.2907e-03, 2.2127e-02, 3.3651e-02, 7.7988e-04],
[9.8698e-02, 1.0052e-02, 3.3992e-01, 1.5326e-01, 1.2081e-01],
[3.2395e-05, 2.2311e-02, 1.2278e-01, 1.8081e-04, 4.8651e-02],
[2.8849e-03, 4.1039e-03, 2.0307e-01, 6.9607e-02, 1.8456e-01],
[7.9154e-03, 8.3458e-02, 1.2748e-03, 3.0697e-02, 3.7786e-02]],
grad_fn=<TransposeBackward0>)
print(y)
tagset_size=len(tag_to_ix)
tensor([0, 1, 2, 0, 1])
one_hot_y = torch.zeros(len(y), tagset_size).scatter_(1,y.reshape(len(y),1), 1)
dL_do=torch.tensor([[[ 1.1719, 4.0198, -0.1581, -6.9059, -4.1330, 5.0020]],
[[ 1.0842, -0.5113, 0.2987, 0.7790, -0.1800, 1.7739]],
[[-16.1690, -10.2418, 9.0003, 10.4557, 6.8416, -34.2560]],
[[ 0.3115, 1.0683, -0.0420, -1.8353, -1.0984, 1.3294]],
[[ 1.3049, -0.6155, 0.3596, 0.9376, -0.2166, 2.1351]]])
print(dL_do)
tensor([[[ 1.1719, 4.0198, -0.1581, -6.9059, -4.1330, 5.0020]],
[[ 1.0842, -0.5113, 0.2987, 0.7790, -0.1800, 1.7739]],
[[-16.1690, -10.2418, 9.0003, 10.4557, 6.8416, -34.2560]],
[[ 0.3115, 1.0683, -0.0420, -1.8353, -1.0984, 1.3294]],
[[ 1.3049, -0.6155, 0.3596, 0.9376, -0.2166, 2.1351]]])
$$
\begin{aligned}
h_i^{(t)} &= \tanh\big(s_i^{(t)}\big)\, q_i^{(t)}\\
\frac{\partial L}{\partial q_i^{(t)}} &= \frac{\partial L}{\partial h_i^{(t)}}\,\tanh\big(s_i^{(t)}\big)\\
\frac{\partial L}{\partial s_i^{(t)}} &= \frac{\partial L}{\partial h_i^{(t)}}\, q_i^{(t)}\,\Big(1-\tanh\big(s_i^{(t)}\big)^2\Big)
\end{aligned}
$$
print(tanh(c))
tensor([[[0.0027, 0.0008, 0.1345, 0.1014, 0.0039, 0.0595]],
[[0.1275, 0.0195, 0.1282, 0.1240, 0.2368, 0.1144]],
[[0.1172, 0.0422, 0.6908, 0.6798, 0.2083, 0.1435]],
[[0.0253, 0.0420, 0.3672, 0.3021, 0.2065, 0.1015]],
[[0.1554, 0.0205, 0.1597, 0.2915, 0.2140, 0.2363]]],
grad_fn=<DivBackward0>)
# Gradient w.r.t. the output gate at the last time step:
# h = tanh(s) * q, so dL/dq = dL/dh * tanh(s).
dL_dq=torch.zeros(dL_do.shape)
dL_dq[-1]=tanh(c[-1])*dL_do[-1]
print(dL_dq)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.2028, -0.0126, 0.0574, 0.2733, -0.0463, 0.5045]]],
grad_fn=<CopySlices>)
# Gradient w.r.t. the cell state at the last time step:
# dL/ds = dL/dh * q * (1 - tanh(s)^2).
dL_ds=torch.zeros(dL_do.shape)
dL_ds[-1]=dL_do[-1]*(1-tanh(c[-1])*tanh(c[-1]))*q[-1]
print(dL_ds)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0639, -0.0234, 0.2650, 0.1432, -0.1783, 0.3224]]],
grad_fn=<CopySlices>)
$$
\begin{aligned}
q_i^{(t)} &= \sigma\Big(b_i^o+\sum_j U_{i,j}^o x_j^{(t)}+\sum_j W_{i,j}^o h_j^{(t-1)}\Big)\\
\frac{\partial L}{\partial \big(b_i^o+\sum_j U_{i,j}^o x_j^{(t)}+\sum_j W_{i,j}^o h_j^{(t-1)}\big)}
 &= \frac{\partial L}{\partial b_i^{o}}
  = \frac{\partial L}{\partial q_i^{(t)}}\,\sigma'
  = \frac{\partial L}{\partial q_i^{(t)}}\,(1-\sigma)\,\sigma
  = \frac{\partial L}{\partial q_i^{(t)}}\, q_i^{(t)}\,\big(1-q_i^{(t)}\big)\\
\frac{\partial L}{\partial W_{i,j}^{o}} &= \frac{\partial L}{\partial q_i^{(t)}}\,\sigma'\, h_j^{(t-1)}\\
\frac{\partial L}{\partial U_{i,j}^{o}} &= \frac{\partial L}{\partial q_i^{(t)}}\,\sigma'\, x_j^{(t)}
\end{aligned}
$$
因为这里只有一层,就不求dq/dx了
$$
\frac{\partial L}{\partial h_j^{(t-1)}}=\sum_i \frac{\partial L}{\partial q_i^{(t)}}\,\sigma'\, W_{i,j}^{o}
$$
LSTM层之后的线性映射层给hj(t-1)传递了一个损失,这个是q传递来的另一个损失。还有通过其他的各种传递过来的。
# Gradient w.r.t. the output-gate pre-activation at the last time step:
# sigmoid'(a) = q * (1 - q).
dL_dqx=torch.zeros(dL_do.shape)
dL_dqx[-1]=dL_dq[-1]*q[-1]*(1-q[-1])
# The bias enters the pre-activation directly, so its gradient is the same
# vector (row 0 is the single batch entry).
dL_dbo=torch.zeros(bo.shape)
dL_dbo+=dL_dqx[-1,0]
print(dL_dqx)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0097, -0.0005, 0.0106, 0.0380, -0.0055, 0.0678]]],
grad_fn=<CopySlices>)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0525, -0.1445, -0.0005, 0.0053, 0.1187, -0.1321]]],
grad_fn=<CopySlices>)
# h_t_1[t] holds h^(t-1): the outputs shifted by one step, zeros for t = 0.
h_t_1=torch.zeros(o.shape)
h_t_1[1:]=o[:-1]
print(h_t_1[-1,0].reshape(hidden_dim,1))
# Use the whole pre-activation gradient row. The earlier dL_dbo[-1] picked only
# its last scalar, which made every column of the product identical.
# NOTE: the captured output shown after this cell came from the scalar version.
print(dL_dqx[-1])
print(h_t_1[-1,0].reshape(hidden_dim,1)*dL_dqx[-1])
tensor([[0.0117],
[0.0337],
[0.1533],
[0.0002],
[0.0696],
[0.0307]], grad_fn=<AsStridedBackward>)
tensor(0.0678, grad_fn=<SelectBackward>)
tensor([[7.9293e-04],
[2.2807e-03],
[1.0387e-02],
[1.2254e-05],
[4.7176e-03],
[2.0805e-03]], grad_fn=<MulBackward0>)
# dL/dWo[j, i] = h_j^(t-1) * dL/da_i: outer product of the previous hidden
# state (column) with the output-gate pre-activation gradient (row).
# Fix: this used the scalar dL_dbo[-1], which gave identical columns; the
# class-based BPTT later in this file uses the full dL_dqx row as well.
# NOTE: the captured output shown after this cell came from the scalar version.
dL_dWo=torch.zeros(Wo.shape)
dL_dWo+=h_t_1[-1,0].reshape(hidden_dim,1)*dL_dqx[-1]
print(dL_dWo)
tensor([[7.9293e-04, 7.9293e-04, 7.9293e-04, 7.9293e-04, 7.9293e-04, 7.9293e-04],
[2.2807e-03, 2.2807e-03, 2.2807e-03, 2.2807e-03, 2.2807e-03, 2.2807e-03],
[1.0387e-02, 1.0387e-02, 1.0387e-02, 1.0387e-02, 1.0387e-02, 1.0387e-02],
[1.2254e-05, 1.2254e-05, 1.2254e-05, 1.2254e-05, 1.2254e-05, 1.2254e-05],
[4.7176e-03, 4.7176e-03, 4.7176e-03, 4.7176e-03, 4.7176e-03, 4.7176e-03],
[2.0805e-03, 2.0805e-03, 2.0805e-03, 2.0805e-03, 2.0805e-03, 2.0805e-03]],
grad_fn=<AddBackward0>)
# The output gate at step t also depends on h^(t-1) through Wo, so add that
# contribution to the gradient of the previous step's hidden state.
dL_do[-2]+=torch.matmul(dL_dqx[-1],torch.transpose(Wo,1,0))
print(dL_do)
tensor([[[ 1.1719, 4.0198, -0.1581, -6.9059, -4.1330, 5.0020]],
[[ 1.0842, -0.5113, 0.2987, 0.7790, -0.1800, 1.7739]],
[[-16.1690, -10.2418, 9.0003, 10.4557, 6.8416, -34.2560]],
[[ 0.3626, 0.9318, -0.1315, -1.9774, -1.0867, 1.3610]],
[[ 1.3049, -0.6155, 0.3596, 0.9376, -0.2166, 2.1351]]],
grad_fn=<CopySlices>)
print(input_x[-1].reshape(embedding_dim,1))
print(dL_dqx[-1])
# dL/dUo[j, i] = x_j^(t) * dL/da_i: outer product of the input (column) with
# the output-gate pre-activation gradient (row).
# Fix: this used the scalar dL_dbo[-1], which gave identical columns.
# NOTE: the captured output shown after this cell came from the scalar version.
dL_dUo=torch.zeros(Uo.shape)
dL_dUo+=input_x[-1].reshape(embedding_dim,1)*dL_dqx[-1]
print(dL_dUo)
tensor([[ 1.2812],
[ 1.0754],
[ 0.7863],
[ 0.6510],
[-1.1592],
[-0.4033]], grad_fn=<AsStridedBackward>)
tensor(0.0678, grad_fn=<SelectBackward>)
tensor([[ 0.0868, 0.0868, 0.0868, 0.0868, 0.0868, 0.0868],
[ 0.0729, 0.0729, 0.0729, 0.0729, 0.0729, 0.0729],
[ 0.0533, 0.0533, 0.0533, 0.0533, 0.0533, 0.0533],
[ 0.0441, 0.0441, 0.0441, 0.0441, 0.0441, 0.0441],
[-0.0786, -0.0786, -0.0786, -0.0786, -0.0786, -0.0786],
[-0.0273, -0.0273, -0.0273, -0.0273, -0.0273, -0.0273]],
grad_fn=<AddBackward0>)
$$
\begin{aligned}
s_i^{(t)} &= f_i^{(t)}\, s_i^{(t-1)}+g_i^{(t)}\,\sigma\Big(b_i^c+\sum_j U_{i,j}^c x_j^{(t)}+\sum_j W_{i,j}^c h_j^{(t-1)}\Big)\\
\frac{\partial L}{\partial f_i^{(t)}} &= \frac{\partial L}{\partial s_i^{(t)}}\, s_i^{(t-1)}\\
\frac{\partial L}{\partial s_i^{(t-1)}} &= \frac{\partial L}{\partial s_i^{(t)}}\, f_i^{(t)}\\
\frac{\partial L}{\partial g_i^{(t)}} &= \frac{\partial L}{\partial s_i^{(t)}}\,\sigma\Big(b_i^c+\sum_j U_{i,j}^c x_j^{(t)}+\sum_j W_{i,j}^c h_j^{(t-1)}\Big)
\end{aligned}
$$
# Gradient w.r.t. the input gate at the last step: s = f*s_prev + g*gc,
# so dL/dg = dL/ds * gc.
dL_dg=torch.zeros(dL_do.shape)
dL_dg[-1]=dL_ds[-1]*gc[-1]
print(dL_dg)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0160, -0.0140, 0.2014, 0.1427, -0.0542, 0.1234]]],
grad_fn=<CopySlices>)
# Gradient w.r.t. the forget gate at the last step: dL/df = dL/ds * s_prev,
# where c[-2] is the cell state of the step before.
dL_df=torch.zeros(dL_do.shape)
dL_df[-1]=dL_ds[-1]*c[-2]
print(dL_df)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0016, -0.0010, 0.1021, 0.0447, -0.0374, 0.0328]]],
grad_fn=<CopySlices>)
# Carry the cell-state gradient one step back: ds^(t)/ds^(t-1) = f^(t).
dL_ds[-2]+=dL_ds[-1]*f[-1]
print(dL_ds)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0039, -0.0039, 0.0058, 0.1272, -0.0662, 0.2722]],
[[ 0.0639, -0.0234, 0.2650, 0.1432, -0.1783, 0.3224]]],
grad_fn=<CopySlices>)
$$
\begin{aligned}
gc_i^{(t)} &= \sigma\Big(b_i^c+\sum_j U_{i,j}^c x_j^{(t)}+\sum_j W_{i,j}^c h_j^{(t-1)}\Big)\\
\frac{\partial L}{\partial gc_i^{(t)}} &= \frac{\partial L}{\partial s_i^{(t)}}\, g_i^{(t)}\\
\frac{\partial L}{\partial b_i^c} &= \frac{\partial L}{\partial gc_i^{(t)}}\,\sigma'
 = \frac{\partial L}{\partial gc_i^{(t)}}\, gc_i^{(t)}\,\big(1-gc_i^{(t)}\big)\\
\frac{\partial L}{\partial W_{i,j}^c} &= \frac{\partial L}{\partial gc_i^{(t)}}\,\sigma'\, h_j^{(t-1)}\\
\frac{\partial L}{\partial U_{i,j}^c} &= \frac{\partial L}{\partial gc_i^{(t)}}\,\sigma'\, x_j^{(t)}\\
\frac{\partial L}{\partial h_j^{(t-1)}} &\mathrel{+}= \sum_i \frac{\partial L}{\partial gc_i^{(t)}}\,\sigma'\, W_{i,j}^{c}
\end{aligned}
$$
# Gradient w.r.t. the candidate pre-activation at the last step:
# dL/ds * g (through s = ... + g*gc) times sigmoid' = gc * (1 - gc).
dL_dgcx=torch.zeros(dL_do.shape)
dL_dgcx[-1]=dL_ds[-1]*g[-1]*gc[-1]*(1-gc[-1])
# Bias gradient equals the pre-activation gradient (row 0 = single batch entry).
dL_dbc=torch.zeros(bc.shape)
dL_dbc+=dL_dgcx[-1,0]
print(dL_dgcx)
tensor([[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00]],
[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00]],
[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00]],
[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
0.0000e+00]],
[[ 7.4212e-03, -1.2571e-04, 9.7080e-03, 1.1346e-05, -1.7320e-02,
3.0807e-02]]], grad_fn=<CopySlices>)
# Candidate weight gradients at the last step (i = -1): outer products of the
# previous hidden state / current input with the pre-activation gradient.
dL_dWc=torch.zeros(Wc.shape)
dL_dUc=torch.zeros(Uc.shape)
i=-1
dL_dWc+=h_t_1[i, 0].reshape(hidden_dim, 1) * dL_dgcx[i]
dL_dUc += input_x[i].reshape(embedding_dim, 1) * dL_dgcx[i]
print(dL_dUc)
tensor([[ 9.5082e-03, -1.6107e-04, 1.2438e-02, 1.4536e-05, -2.2190e-02,
3.9470e-02],
[ 7.9811e-03, -1.3520e-04, 1.0440e-02, 1.2202e-05, -1.8626e-02,
3.3131e-02],
[ 5.8356e-03, -9.8853e-05, 7.6338e-03, 8.9215e-06, -1.3619e-02,
2.4225e-02],
[ 4.8314e-03, -8.1843e-05, 6.3202e-03, 7.3863e-06, -1.1276e-02,
2.0056e-02],
[-8.6024e-03, 1.4572e-04, -1.1253e-02, -1.3151e-05, 2.0076e-02,
-3.5710e-02],
[-2.9927e-03, 5.0695e-05, -3.9148e-03, -4.5752e-06, 6.9843e-03,
-1.2423e-02]], grad_fn=<AddBackward0>)
# The candidate at step t also depends on h^(t-1) through Wc; add that
# contribution to the previous step's hidden-state gradient.
dL_do[-2]+=torch.matmul(dL_dgcx[-1],torch.transpose(Wc,1,0))
print(dL_do)
tensor([[[ 1.1719, 4.0198, -0.1581, -6.9059, -4.1330, 5.0020]],
[[ 1.0842, -0.5113, 0.2987, 0.7790, -0.1800, 1.7739]],
[[-16.1690, -10.2418, 9.0003, 10.4557, 6.8416, -34.2560]],
[[ 0.3128, 0.9465, -0.0852, -1.8964, -1.1307, 1.4150]],
[[ 1.3049, -0.6155, 0.3596, 0.9376, -0.2166, 2.1351]]],
grad_fn=<CopySlices>)
$$
\begin{aligned}
f_i^{(t)} &= \sigma\Big(b_i^f+\sum_j U_{i,j}^f x_j^{(t)}+\sum_j W_{i,j}^f h_j^{(t-1)}\Big)\\
g_i^{(t)} &= \sigma\Big(b_i^g+\sum_j U_{i,j}^g x_j^{(t)}+\sum_j W_{i,j}^g h_j^{(t-1)}\Big)\\
\frac{\partial L}{\partial b_i^f} &= \frac{\partial L}{\partial f_i^{(t)}}\,\sigma'\\
\frac{\partial L}{\partial W_{i,j}^f} &= \frac{\partial L}{\partial f_i^{(t)}}\,\sigma'\, h_j^{(t-1)}\\
\frac{\partial L}{\partial U_{i,j}^f} &= \frac{\partial L}{\partial f_i^{(t)}}\,\sigma'\, x_j^{(t)}\\
\frac{\partial L}{\partial h_j^{(t-1)}} &\mathrel{+}= \sum_i \frac{\partial L}{\partial f_i^{(t)}}\,\sigma'\, W_{i,j}^{f}\\
\frac{\partial L}{\partial b_i^g} &= \frac{\partial L}{\partial g_i^{(t)}}\,\sigma'\\
\frac{\partial L}{\partial W_{i,j}^g} &= \frac{\partial L}{\partial g_i^{(t)}}\,\sigma'\, h_j^{(t-1)}\\
\frac{\partial L}{\partial U_{i,j}^g} &= \frac{\partial L}{\partial g_i^{(t)}}\,\sigma'\, x_j^{(t)}\\
\frac{\partial L}{\partial h_j^{(t-1)}} &\mathrel{+}= \sum_i \frac{\partial L}{\partial g_i^{(t)}}\,\sigma'\, W_{i,j}^{g}
\end{aligned}
$$
# Accumulators for the forget-gate (f) and input-gate (g) gradients.
dL_dfx=torch.zeros(dL_do.shape)
dL_dbf=torch.zeros(bf.shape)
dL_dWf=torch.zeros(Wf.shape)
dL_dUf=torch.zeros(Uf.shape)
dL_dgx=torch.zeros(dL_do.shape)
dL_dbg=torch.zeros(bg.shape)
dL_dWg=torch.zeros(Wg.shape)
dL_dUg=torch.zeros(Ug.shape)
# Work on the last time step.
i=-1
# Forget gate: through sigmoid' = f * (1 - f), then to bias / Wf / Uf.
dL_dfx[i] = dL_df[i] * f[i] * (1 - f[i])
dL_dbf += dL_dfx[i, 0]
dL_dWf += h_t_1[i, 0].reshape(hidden_dim, 1) * dL_dfx[i]
dL_dUf += input_x[i].reshape(embedding_dim, 1) * dL_dfx[i]
# Input gate: through sigmoid' = g * (1 - g), then to bias / Wg / Ug.
dL_dgx[i] = dL_dg[i] * g[i] * (1 - g[i])
dL_dbg += dL_dgx[i, 0]
dL_dWg += h_t_1[i, 0].reshape(hidden_dim, 1) * dL_dgx[i]
dL_dUg += input_x[i].reshape(embedding_dim, 1) * dL_dgx[i]
# Both gates read h^(t-1), so pass their gradients back to the previous step.
dL_do[i - 1] += torch.matmul(dL_dfx[i], torch.transpose(Wf, 1, 0))
dL_do[i - 1] += torch.matmul(dL_dgx[i], torch.transpose(Wg, 1, 0))
# Gradient w.r.t. the input at the last time step: the input feeds all four
# pre-activations, so sum the four contributions (each through its U matrix).
i=-1
dL_dx=torch.zeros(input_x.shape)
dL_dx[i]+=torch.matmul(dL_dqx[i], torch.transpose(Uo, 1, 0))
dL_dx[i]+=torch.matmul(dL_dgcx[i], torch.transpose(Uc, 1, 0))
dL_dx[i]+=torch.matmul(dL_dfx[i], torch.transpose(Uf, 1, 0))
dL_dx[i]+=torch.matmul(dL_dgx[i], torch.transpose(Ug, 1, 0))
print(dL_dx)
tensor([[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.0747, -0.1784, -0.0532, -0.0891, 0.0476, -0.1091]]],
grad_fn=<CopySlices>)
2.序列标注
import torch
import torch.nn as nn
class LSTMTag:
    """Sequence tagger: embedding -> hand-written LSTM -> linear -> log-softmax.

    All parameters are plain tensors, and training is done by the manual
    backward pass in :meth:`BP` (plain SGD with step size ``self.lr``).
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, layers_num):
        """Create randomly initialised parameters.

        Args:
            embedding_dim: Size of each word embedding.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of distinct words.
            tagset_size: Number of distinct tags.
            layers_num: Stored only; ``forward`` uses a single LSTM layer.
        """
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tagset_size = tagset_size
        self.layers_num = layers_num
        self.embedding_dim = embedding_dim
        self.embedding = torch.randn(self.vocab_size, self.embedding_dim)
        self.hidden = self.init_hidden()
        self.lstm = LSTM(self.embedding_dim, self.hidden_dim)
        # Second layer is constructed but not used by forward().
        self.lstm1 = LSTM(self.hidden_dim, self.hidden_dim)
        self.hidden2tag = torch.randn(self.hidden_dim, self.tagset_size)
        self.lr = 0.001

    def init_hidden(self):
        """Return an all-zero ``(h0, c0)`` pair.

        There is no hidden state before the first step, so one is created.
        Each tensor has shape ``(1, 1, hidden_dim)``, following the PyTorch
        layout ``(num_layers * num_directions, batch_size, hidden_dim)``.
        """
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def log_softmax(self, x):
        """Row-wise log-softmax of a 2-D tensor.

        The row maximum is subtracted before exponentiating so that large
        logits cannot overflow ``exp`` (which previously produced inf - inf).
        """
        shifted = x - x.max(dim=1, keepdim=True)[0]
        return shifted - torch.log(torch.sum(torch.exp(shifted), dim=1, keepdim=True))

    def forward(self, x):
        """Compute per-token log-probabilities over the tag set.

        Args:
            x: 1-D ``long`` tensor of word indices.

        Returns:
            Tensor of shape ``(len(x), tagset_size)`` with log-probabilities.
            Intermediate values are cached on ``self`` for :meth:`BP`.
        """
        seq_len = len(x)
        # Embedding lookup written as one-hot @ embedding matrix, so the
        # embedding gradient in BP is a plain matrix product.
        self.one_hot = torch.zeros(seq_len, self.vocab_size).scatter_(1, x.reshape(seq_len, 1), 1)
        my_embed = torch.matmul(self.one_hot, self.embedding)
        self.o, (h, c) = self.lstm.single_layer_LSTM(my_embed.view(seq_len, 1, -1), self.hidden)
        tagspace = torch.matmul(self.o.view(seq_len, -1), self.hidden2tag)
        self.tag_score = self.log_softmax(tagspace)
        return self.tag_score

    def BP(self, y):
        """Backpropagate the negative log-likelihood and take one SGD step.

        Must be called after :meth:`forward` on the matching input sentence.

        Args:
            y: 1-D ``long`` tensor of gold tag indices, one per token.

        Side effects:
            Sets ``self.Loss`` to the per-token negative log-likelihood and
            updates ``hidden2tag``, ``embedding`` and the LSTM weights.
        """
        # Sequence length comes from y; the previous code read a global `x`.
        seq_len = len(y)
        one_hot_y = torch.zeros(seq_len, self.tagset_size).scatter_(1, y.reshape(seq_len, 1), 1.)
        # Per-token loss: -log p(gold tag).
        self.Loss = -torch.sum(one_hot_y * self.tag_score, dim=1)
        # d(NLL)/d(logits) for softmax + NLL is softmax - one_hot.
        dL_dtagspace = torch.exp(self.tag_score) - one_hot_y
        hidden_seq = self.o.view(seq_len, -1)
        d_hidden2tag = torch.matmul(torch.transpose(hidden_seq, 1, 0), dL_dtagspace)
        # Use the pre-update hidden2tag to push the gradient into the LSTM.
        dL_do = torch.matmul(dL_dtagspace, torch.transpose(self.hidden2tag, 1, 0))
        dL_dembedding = self.lstm.BPTT(dL_do.view(seq_len, 1, -1))
        # one_hot^T routes each token's input gradient to its embedding row.
        dL_dEm = torch.matmul(torch.transpose(self.one_hot, 1, 0), dL_dembedding.view(seq_len, -1))
        self.hidden2tag = self.hidden2tag - d_hidden2tag * self.lr
        self.embedding -= dL_dEm * self.lr
class LSTM:
    """Single-layer LSTM with a hand-written forward pass and BPTT.

    Gates (input ``x``, previous hidden ``h``, previous cell ``s``)::

        f  = sigmoid(bf + x Uf + h Wf)      forget gate
        g  = sigmoid(bg + x Ug + h Wg)      input gate
        gc = sigmoid(bc + x Uc + h Wc)      candidate (sigmoid, not tanh)
        q  = sigmoid(bo + x Uo + h Wo)      output gate
        s' = f * s + g * gc
        h' = tanh(s') * q

    ``single_layer_LSTM`` caches every intermediate value that ``BPTT`` needs.
    """

    # Parameter attribute names, grouped per pre-activation: (bias, W, U).
    _GATES = (("bo", "Wo", "Uo"), ("bc", "Wc", "Uc"),
              ("bf", "Wf", "Uf"), ("bg", "Wg", "Ug"))

    def __init__(self, embedding_dim, hidden_dim):
        """Create randomly initialised weights.

        Args:
            embedding_dim: Size of each input vector.
            hidden_dim: Size of the hidden and cell state.
        """
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        # Forget gate.
        self.Uf = torch.randn(self.embedding_dim, self.hidden_dim)
        self.Wf = torch.randn(self.hidden_dim, self.hidden_dim)
        self.bf = torch.randn(self.hidden_dim)
        # Input gate.
        self.Ug = torch.randn(self.embedding_dim, self.hidden_dim)
        self.Wg = torch.randn(self.hidden_dim, self.hidden_dim)
        self.bg = torch.randn(self.hidden_dim)
        # Cell-state candidate.
        self.Uc = torch.randn(self.embedding_dim, self.hidden_dim)
        self.Wc = torch.randn(self.hidden_dim, self.hidden_dim)
        self.bc = torch.randn(self.hidden_dim)
        # Output gate.
        self.Uo = torch.randn(self.embedding_dim, self.hidden_dim)
        self.Wo = torch.randn(self.hidden_dim, self.hidden_dim)
        self.bo = torch.randn(self.hidden_dim)
        self.lr = 0.001

    def sigmoid(self, x):
        """Element-wise logistic function."""
        return 1 / (1 + torch.exp(-x))

    def tanh(self, x):
        """Element-wise tanh built from exponentials.

        The input is clamped to [-30, 30] first: beyond that tanh is already
        exactly +/-1 in floating point, and without the clamp ``exp`` overflows
        and the quotient becomes inf/inf = NaN.
        """
        x = torch.clamp(x, min=-30.0, max=30.0)
        pos = torch.exp(x)
        neg = torch.exp(-x)
        return (pos - neg) / (pos + neg)

    def LSTM_Cell(self, input_x, h0, c0):
        """Advance the LSTM by one time step.

        Args:
            input_x: Input for this step.
            h0: Previous hidden state.
            c0: Previous cell state.

        Returns:
            ``((h1, c1), f1, g1, q1, gc0)``: new hidden/cell state, then the
            forget, input and output gate activations and the candidate value.
        """
        f1 = self.sigmoid(self.bf + torch.matmul(input_x, self.Uf) + torch.matmul(h0, self.Wf))
        g1 = self.sigmoid(self.bg + torch.matmul(input_x, self.Ug) + torch.matmul(h0, self.Wg))
        gc0 = self.sigmoid(self.bc + torch.matmul(input_x, self.Uc) + torch.matmul(h0, self.Wc))
        c1 = f1 * c0 + g1 * gc0
        q1 = self.sigmoid(self.bo + torch.matmul(input_x, self.Uo) + torch.matmul(h0, self.Wo))
        h1 = self.tanh(c1) * q1
        return (h1, c1), f1, g1, q1, gc0

    def single_layer_LSTM(self, input_x, hidden):
        """Forward pass over a whole sequence.

        Args:
            input_x: Tensor of shape ``(steps, batch, embedding_dim)``.
            hidden: Initial ``(h0, c0)`` pair, e.g. two ``(1, 1, hidden_dim)``
                zero tensors.

        Returns:
            ``(h, (h_last, c_last))`` where ``h`` has shape
            ``(steps, batch, hidden_dim)`` and the pair is the final state as
            returned by the last cell evaluation.
        """
        h_t, c_t = hidden
        steps, batch = input_x.shape[0], input_x.shape[1]
        shape = (steps, batch, self.hidden_dim)
        self.x = input_x
        self.h = torch.zeros(shape)
        self.c = torch.zeros(shape)
        self.f = torch.zeros(shape)
        self.g = torch.zeros(shape)
        self.q = torch.zeros(shape)
        self.gc = torch.zeros(shape)
        # States *entering* each step, including the caller's initial state,
        # so BPTT does not have to assume the sequence started from zeros.
        self.h_prev = torch.zeros(shape)
        self.c_prev = torch.zeros(shape)
        for t in range(steps):
            self.h_prev[t] = h_t.reshape(-1, self.hidden_dim)
            self.c_prev[t] = c_t.reshape(-1, self.hidden_dim)
            (h_t, c_t), f_t, g_t, q_t, gc_t = self.LSTM_Cell(input_x[t], h_t, c_t)
            self.h[t] = h_t.reshape(batch, self.hidden_dim)
            self.c[t] = c_t.reshape(batch, self.hidden_dim)
            self.f[t] = f_t.reshape(batch, self.hidden_dim)
            self.g[t] = g_t.reshape(batch, self.hidden_dim)
            self.q[t] = q_t.reshape(batch, self.hidden_dim)
            self.gc[t] = gc_t.reshape(batch, self.hidden_dim)
        return self.h, (h_t, c_t)

    def BPTT(self, dL_do):
        """Backpropagate through time and apply one SGD step to all weights.

        Must be called after :meth:`single_layer_LSTM`, which fills the caches.

        Args:
            dL_do: Gradient of the loss w.r.t. each step's hidden output,
                shape ``(steps, batch, hidden_dim)``. It is not modified.

        Returns:
            Gradient of the loss w.r.t. the input sequence, same shape as the
            input given to :meth:`single_layer_LSTM`.
        """
        # This *is* the backward pass; do not record it for autograd.
        with torch.no_grad():
            # Copy so that recurrent contributions added below do not leak
            # into the caller's tensor.
            grad_h = dL_do.clone()
            grad_s = torch.zeros_like(grad_h)
            dL_dx = torch.zeros_like(self.x)
            grads = {
                name: torch.zeros_like(getattr(self, name))
                for gate in self._GATES for name in gate
            }
            for t in range(grad_h.shape[0] - 1, -1, -1):
                tanh_c = self.tanh(self.c[t])
                # h = tanh(s) * q
                d_q = grad_h[t] * tanh_c
                grad_s[t] += grad_h[t] * self.q[t] * (1 - tanh_c * tanh_c)
                # s = f * s_prev + g * gc
                d_f = grad_s[t] * self.c_prev[t]
                d_g = grad_s[t] * self.gc[t]
                d_gc = grad_s[t] * self.g[t]
                # Through each sigmoid to its pre-activation, in _GATES order.
                pre_grads = (
                    d_q * self.q[t] * (1 - self.q[t]),
                    d_gc * self.gc[t] * (1 - self.gc[t]),
                    d_f * self.f[t] * (1 - self.f[t]),
                    d_g * self.g[t] * (1 - self.g[t]),
                )
                x_T = self.x[t].t()
                h_T = self.h_prev[t].t()
                for (b_name, w_name, u_name), d_pre in zip(self._GATES, pre_grads):
                    grads[b_name] += d_pre.sum(dim=0)
                    grads[w_name] += torch.matmul(h_T, d_pre)
                    grads[u_name] += torch.matmul(x_T, d_pre)
                    dL_dx[t] += torch.matmul(d_pre, getattr(self, u_name).t())
                    # t > 0 (not t > 1): step 1 must feed step 0 as well.
                    if t > 0:
                        grad_h[t - 1] += torch.matmul(d_pre, getattr(self, w_name).t())
                if t > 0:
                    grad_s[t - 1] += grad_s[t] * self.f[t]
            for name, grad in grads.items():
                param = getattr(self, name)
                param -= self.lr * grad
            return dL_dx
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1-D tensor of integer ids.

    Args:
        seq: Iterable of hashable tokens (words or tags).
        to_ix: Mapping from token to its integer index.

    Returns:
        A ``torch.long`` tensor holding one index per token, in input order.

    Raises:
        KeyError: If a token is not present in ``to_ix``.
    """
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
# Toy corpus: (tokenised sentence, per-token tag) pairs.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]
# Give every distinct word the next free integer id, in first-seen order.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
# Real models usually use larger sizes such as 32 or 64.
# Small sizes are used here so the weights are easy to inspect during training.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
model = LSTMTag(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix), 1)

# Predictions on the first sentence before any training.
print("训练前")
x = prepare_sequence(training_data[0][0], word_to_ix)
y = prepare_sequence(training_data[0][1], tag_to_ix)
print(torch.max(model.forward(x), axis=1))

# One manual forward/backward pass (a single SGD step) per sentence per epoch.
for epoch in range(30):
    for sentence, tags in training_data:
        x = prepare_sequence(sentence, word_to_ix)
        y = prepare_sequence(tags, tag_to_ix)
        model.forward(x)
        model.BP(y)

# Predictions on the same sentence after training.
print("训练后")
x = prepare_sequence(training_data[0][0], word_to_ix)
y = prepare_sequence(training_data[0][1], tag_to_ix)
print(torch.max(model.forward(x), axis=1))
{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
训练前
torch.return_types.max(
values=tensor([-0.6463, -0.5826, -0.7066, -0.2778, -0.2951]),
indices=tensor([1, 1, 1, 1, 1]))
训练后
torch.return_types.max(
values=tensor([-0.6426, -0.2794, -0.1518, -0.1473, -0.8550]),
indices=tensor([1, 1, 1, 1, 2]))