LSTM equations
i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi})
f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf})
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg})
o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho})
c_t = f_t * c_{(t-1)} + i_t * g_t
h_t = o_t * \tanh(c_t)
Here i_t, f_t, g_t, o_t are the input, forget, cell (candidate), and output gates, \sigma is the sigmoid function, and * is element-wise multiplication.
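These are exactly the update rules nn.LSTMCell implements, so they can be checked numerically. A minimal sketch, assuming PyTorch's documented parameter layout where weight_ih stacks the four gate matrices W_ii, W_if, W_ig, W_io along dim 0:

import torch
import torch.nn as nn

torch.manual_seed(0)
cell = nn.LSTMCell(input_size=4, hidden_size=3)
x = torch.randn(1, 4)                       # one sample
h0 = torch.zeros(1, 3)
c0 = torch.zeros(1, 3)

# gates are stacked along dim 0 in the order [i | f | g | o]
gates = x @ cell.weight_ih.t() + cell.bias_ih + h0 @ cell.weight_hh.t() + cell.bias_hh
i, f, g, o = gates.chunk(4, dim=1)
i, f, o = torch.sigmoid(i), torch.sigmoid(f), torch.sigmoid(o)
g = torch.tanh(g)
c1 = f * c0 + i * g                         # c_t = f_t * c_{t-1} + i_t * g_t
h1 = o * torch.tanh(c1)                     # h_t = o_t * tanh(c_t)

h_ref, c_ref = cell(x, (h0, c0))
print(torch.allclose(h1, h_ref), torch.allclose(c1, c_ref))  # True True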
How loss_func = nn.CrossEntropyLoss() is computed internally
\text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right) = -x[class] + \log\left(\sum_j \exp(x[j])\right)
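The formula is easy to verify against PyTorch directly; a small sketch with made-up logits:

import torch
import torch.nn as nn

x = torch.tensor([[1.0, 2.0, 0.5]])   # one sample, 3 classes (made-up logits)
target = torch.tensor([1])            # true class index
loss_func = nn.CrossEntropyLoss()
manual = -x[0, 1] + torch.logsumexp(x[0], dim=0)   # -x[class] + log(sum_j exp(x[j]))
print(loss_func(x, target).item(), manual.item())  # the two values agree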
With the weight argument, each sample's loss is additionally scaled by the weight of its true class:
\text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)
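This per-sample form corresponds to reduction='none'; with the default reduction='mean', PyTorch additionally divides by the sum of the selected weights. A sketch with made-up per-class weights:

import torch
import torch.nn as nn

x = torch.tensor([[1.0, 2.0, 0.5]])
target = torch.tensor([1])
w = torch.tensor([1.0, 2.0, 0.5])     # made-up per-class weights
loss_w = nn.CrossEntropyLoss(weight=w, reduction='none')
manual = w[1] * (-x[0, 1] + torch.logsumexp(x[0], dim=0))
print(loss_w(x, target), manual)      # both give weight[class] * unweighted loss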
Diagram (figure omitted)
Mathematical background
Hadamard product: the * in the c_t and h_t updates above denotes element-wise multiplication, not matrix multiplication.
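A short sketch contrasting the two products (plain PyTorch, nothing assumed beyond torch):

import torch

a = torch.tensor([[1., 2.], [3., 4.]])
b = torch.tensor([[10., 20.], [30., 40.]])
print(a * b)   # Hadamard (element-wise): [[10., 40.], [90., 160.]]
print(a @ b)   # matrix product, for contrast: [[70., 100.], [150., 220.]]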
Weight initialization: by default, nn.LSTM initializes every weight and bias from
\mathcal{U}(-\sqrt{k}, \sqrt{k}), \qquad k = \frac{1}{\text{hidden\_size}}
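A quick sketch checking that every parameter of a freshly built nn.LSTM falls inside this range (hidden_size = 20 here, so the bound is sqrt(1/20)):

import math
import torch.nn as nn

rnn = nn.LSTM(10, 20, 2)            # hidden_size = 20
bound = math.sqrt(1.0 / 20)         # sqrt(k), k = 1 / hidden_size
for name, p in rnn.named_parameters():
    assert p.min().item() >= -bound and p.max().item() <= bound
print('all parameters lie in [-{0:.4f}, {0:.4f}]'.format(bound))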
Tensor addition and multiplication with mismatched dimensions
Addition follows broadcasting: in the example below, the (3,) tensor input2 is treated as a row and added to every row of the (5, 3) tensor input1.
import torch
import torch.nn as nn
input1 = torch.ones(5, 3)
print(input1)
input2 = torch.randn(3)
print(input2)
print('+++++++++++++++++++++++++++++')
print(input1+input2)
Output:
tensor([[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.]])
tensor([-0.8281, 1.7018, -0.9728])
+++++++++++++++++++++++++++++
tensor([[0.1719, 2.7018, 0.0272],
[0.1719, 2.7018, 0.0272],
[0.1719, 2.7018, 0.0272],
[0.1719, 2.7018, 0.0272],
[0.1719, 2.7018, 0.0272]])
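The same rules as NumPy broadcasting apply: trailing dimensions must either match or be 1. A sketch of one case that works and one that fails:

import torch

a = torch.ones(5, 3)
print((a + torch.randn(1, 3)).shape)  # (5, 3): the single row is repeated 5 times
try:
    a + torch.randn(5)                # trailing dims 3 vs 5: not broadcastable
except RuntimeError as e:
    print('RuntimeError:', e)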
Multiplication
input1 = torch.ones(5, 3)
print(input1)
input2 = torch.randn(3, 1)
print(input2)
print('+++++++++++++++++++++++++++++')
print(input1.matmul(input2))
tensor([[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.]])
tensor([[-0.5661],
[-1.7032],
[-2.1534]])
+++++++++++++++++++++++++++++
tensor([[-4.4227],
[-4.4227],
[-4.4227],
[-4.4227],
[-4.4227]])
When input2 = torch.randn(3, 1) is changed to input2 = torch.randn(3), the output becomes 1-D: matmul treats a trailing 1-D argument as a vector and contracts that dimension away, so (5, 3) @ (3,) yields shape (5,):
tensor([[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.],
[1., 1., 1.]])
tensor([0.0792, 0.3583, 0.3870])
+++++++++++++++++++++++++++++
tensor([0.8244, 0.8244, 0.8244, 0.8244, 0.8244])
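The shape rules of matmul explain the difference; a sketch:

import torch

m = torch.ones(5, 3)
print(torch.matmul(m, torch.randn(3, 1)).shape)  # torch.Size([5, 1]): matrix @ matrix
print(torch.matmul(m, torch.randn(3)).shape)     # torch.Size([5]): matrix @ vector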
Build a simple LSTM and check the output dimensions
rnn = nn.LSTM(10, 20, 2)        # input_size=10, hidden_size=20, num_layers=2
input = torch.ones(5, 3, 10)    # (seq_len, batch, input_size)
h0 = torch.randn(2, 3, 20)      # (num_layers, batch, hidden_size)
c0 = torch.randn(2, 3, 20)
output, (hn, cn) = rnn(input, (h0, c0))
print(output.shape)             # (seq_len, batch, hidden_size)
print(hn.shape)                 # (num_layers, batch, hidden_size)
print(cn.shape)
torch.Size([5, 3, 20])
torch.Size([2, 3, 20])
torch.Size([2, 3, 20])
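output collects the top layer's hidden state at every time step, while hn holds the final hidden state of each layer, so the last time step of output equals the last layer of hn. A self-contained sketch of this check:

import torch
import torch.nn as nn

rnn = nn.LSTM(10, 20, 2)
output, (hn, cn) = rnn(torch.ones(5, 3, 10))   # initial states default to zeros
print(torch.allclose(output[-1], hn[-1]))      # True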
Walkthrough of the official example
Link: official lstm-example
Code
generate_sine_wave.py
import numpy as np
import torch

np.random.seed(2)
T = 20
L = 1000
N = 100
x = np.empty((N, L), 'int64')  # allocate an uninitialized (100, 1000) int64 array
x[:] = np.array(range(L)) + np.random.randint(-4 * T, 4 * T, N).reshape(N, 1)  # (1000,) range broadcast against (100, 1) random offsets drawn from [-80, 80)
data = np.sin(x / 1.0 / T).astype('float64')  # (100, 1000): each element is sin((index + offset) / T)
print(x)
print(data)
torch.save(data, open('traindata.pt', 'wb'))
'''
>>> each row of x looks like [-12, -11, -10, ..., 987] (length 1000); there are 100 such rows
>>> each element is then divided by T = 20 and passed through sin
>>> a smaller example of the same broadcast:
>>> aa=np.random.randint(-4 * 20, 4 * 20, 10).reshape(10,1)
>>> bb= np.array(range(10))
>>> aa
array([[-73],
[ 7],
[-17],
[ 42],
[ 66],
[ 3],
[-60],
[ 34],
[ 44],
[-24]])
>>> bb
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> aa+bb
array([[-73, -72, -71, -70, -69, -68, -67, -66, -65, -64],
[ 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
[-17, -16, -15, -14, -13, -12, -11, -10, -9, -8],
[ 42, 43, 44, 45, 46, 47, 48, 49, 50, 51],
[ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75],
[ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
[-60, -59, -58, -57, -56, -55, -54, -53, -52, -51],
[ 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],
[ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53],
[-24, -23, -22, -21, -20, -19, -18, -17, -16, -15]])
Broadcasting again: each row offset in the (10, 1) array aa is added element-wise to the 1-D array bb, producing one row per offset.
'''
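A quick sanity check of the saved file (a sketch; traindata.pt is the file written above, and torch.load returns the NumPy array that was saved):

import torch

data = torch.load('traindata.pt')
print(data.shape)              # (100, 1000): 100 sine waves, 1000 time steps each
print(data.min(), data.max())  # all values lie in [-1, 1]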
train.py
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
class Sequence(nn.Module):
    def __init__(self):
        super(Sequence, self).__init__()
        self.lstm1 = nn.LSTMCell(1, 51)   # input_size=1, hidden_size=51
        self.lstm2 = nn.LSTMCell(51, 51)
        self.linear = nn.Linear(51, 1)

    def forward(self, input, future=0):
        outputs = []
        h_t = torch.zeros(input.size(0), 51, dtype=torch.double)   # input.size(0) = batch_size, hidden_size = 51
        c_t = torch.zeros(input.size(0), 51, dtype=torch.double)
        h_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)
        c_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)

        # input_t is one column of input, a (97, 1) tensor
        for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):  # chunk() splits the tensor into pieces along dim=1 and returns them as a tuple
            # h_t (97, 51), c_t (97, 51)
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))  # torch.cat() can be seen as the inverse of torch.split()/torch.chunk(); see https://blog.csdn.net/benbenls/article/details/102974070
            # h_t2 (97, 51), c_t2 (97, 51)
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs += [output]  # each element of outputs is a (97, 1) tensor; len(outputs) = 999
        for i in range(future):  # if we should predict the future
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs += [output]  # [] + [1, 2, 3] = [1, 2, 3]; outputs = [(97, 1) tensors, ...]
        outputs = torch.stack(outputs, 1).squeeze(2)  # stack gives (97, 999, 1); squeeze(2) drops the trailing 1
        return outputs
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--steps', type=int, default=2, help='steps to run')
    opt = parser.parse_args()
    # set random seed to 0
    np.random.seed(0)
    torch.manual_seed(0)
    # load data and make training set
    data = torch.load('traindata.pt')             # (100, 1000)
    input = torch.from_numpy(data[3:, :-1])       # rows 3..99 for training: steps 0..998
    target = torch.from_numpy(data[3:, 1:])       # same rows shifted one step ahead
    test_input = torch.from_numpy(data[:3, :-1])  # first 3 rows held out for testing
    test_target = torch.from_numpy(data[:3, 1:])
    # build the model
    seq = Sequence()
    seq.double()
    criterion = nn.MSELoss()
    # use LBFGS as optimizer since we can load the whole data to train
    optimizer = optim.LBFGS(seq.parameters(), lr=0.8)
    # begin to train
    for i in range(opt.steps):
        print('STEP: ', i)

        def closure():  # a closure that reevaluates the model and returns the loss
            optimizer.zero_grad()
            out = seq(input)
            loss = criterion(out, target)
            print('loss:', loss.item())
            loss.backward()
            return loss

        optimizer.step(closure)
        # begin to predict, no need to track gradient here
        with torch.no_grad():
            future = 1000
            pred = seq(test_input, future=future)
            loss = criterion(pred[:, :-future], test_target)
            print('test loss:', loss.item())
            y = pred.detach().numpy()
        # draw the result
        plt.figure(figsize=(30, 10))
        plt.title('Predict future values for time sequences\n(Dashlines are predicted values)', fontsize=30)
        plt.xlabel('x', fontsize=20)
        plt.ylabel('y', fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)

        def draw(yi, color):
            plt.plot(np.arange(input.size(1)), yi[:input.size(1)], color, linewidth=2.0)
            plt.plot(np.arange(input.size(1), input.size(1) + future), yi[input.size(1):], color + ':', linewidth=2.0)

        draw(y[0], 'r')
        draw(y[1], 'g')
        draw(y[2], 'b')
        plt.savefig('predict%d.pdf' % i)
        plt.close()
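One detail worth highlighting: unlike SGD or Adam, LBFGS may reevaluate the loss several times per optimizer.step(), which is why the example wraps the forward and backward passes in a closure. A self-contained sketch on a toy quadratic (the parameter w and the target 3.0 are made up for illustration):

import torch
import torch.optim as optim

w = torch.tensor([0.0], requires_grad=True)
optimizer = optim.LBFGS([w], lr=0.8)

def closure():
    optimizer.zero_grad()
    loss = (w - 3.0).pow(2).sum()   # minimize (w - 3)^2
    loss.backward()
    return loss

for _ in range(5):
    optimizer.step(closure)          # step() may call closure multiple times

print(w)                             # tensor close to 3.0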
Handwritten notes (image omitted)