Understanding LSTM
This post walks through the PyTorch code for an LSTM and the question of LSTM parameter dimensions.
LSTM implementation code
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
class Rnn(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_layer, n_classes):
        super(Rnn, self).__init__()
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(in_dim, hidden_dim, n_layer, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        # If h_0 and c_0 are not passed in, LSTM automatically creates
        # zero-initialized defaults. They are built explicitly here; the
        # batch size is taken from x rather than hard-coded, so the last
        # (smaller) batch of an epoch does not crash.
        h_0 = torch.zeros(self.n_layer, x.size(0), self.hidden_dim)
        c_0 = torch.zeros(self.n_layer, x.size(0), self.hidden_dim)
        out, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        # The final hidden state can also be read from out:
        # x = out[:, -1, :]
        x = h_n[-1, :, :]
        x = self.classifier(x)
        return x
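Because batch_first=True, out has shape (batch, seq_len, hidden_dim) and h_n has shape (n_layer, batch, hidden_dim), so h_n[-1] is the top layer's hidden state at the last timestep and, for a unidirectional LSTM, equals out[:, -1, :]. A minimal sanity check (the model/dummy/lstm names here are just for illustration, assuming the Rnn class above has been defined):

# Quick shape check for the Rnn module above.
model = Rnn(in_dim=28, hidden_dim=10, n_layer=2, n_classes=10)
dummy = torch.randn(4, 28, 28)                  # (batch, seq_len, in_dim); batch of 4 is arbitrary
print(model(dummy).shape)                       # torch.Size([4, 10])
lstm = nn.LSTM(28, 10, num_layers=2, batch_first=True)
out, (h_n, c_n) = lstm(dummy)
print(torch.allclose(h_n[-1], out[:, -1, :]))   # True: same tensor, reached two ways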
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])
trainset = torchvision.datasets.MNIST(root='../dataset/mnist', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
testset = torchvision.datasets.MNIST(root='../dataset/mnist', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False)
net = Rnn(28, 10, 2, 10)
net = net.to('cpu')
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to('cpu'), targets.to('cpu')
        optimizer.zero_grad()
        # squeeze drops the channel dim: (batch, 1, 28, 28) -> (batch, 28, 28)
        outputs = net(torch.squeeze(inputs, 1))
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        print(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
def test(epoch):
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to('cpu'), targets.to('cpu')
            outputs = net(torch.squeeze(inputs, 1))
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                  % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
for epoch in range(200):
    train(epoch)
    # test(epoch)
In the GRU formulation, the linear transformations use no bias term, and the hidden-state parameters are 3 times those of a standard RNN rather than the LSTM's 4 times; in other words, a GRU has fewer parameters than an LSTM but comparable performance.
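The 4x vs. 3x ratio is easy to verify from the shapes of the gate weight matrices. A minimal sketch (input_size=28 and hidden_size=10 are arbitrary; note that PyTorch's nn.RNN/nn.GRU/nn.LSTM do include bias terms by default, controlled by the bias argument):

rnn = nn.RNN(28, 10)
gru = nn.GRU(28, 10)
lstm = nn.LSTM(28, 10)
print(rnn.weight_ih_l0.shape)    # torch.Size([10, 28]) -> 1 * hidden_size (single transform)
print(gru.weight_ih_l0.shape)    # torch.Size([30, 28]) -> 3 * hidden_size (r, z, n gates)
print(lstm.weight_ih_l0.shape)   # torch.Size([40, 28]) -> 4 * hidden_size (i, f, g, o gates)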
Partial code for a GRU implementation
>>> import torch
>>> import torch.nn as nn
>>> gru = nn.GRU(input_size=50, hidden_size=50, batch_first=True)
>>> embed = nn.Embedding(3, 50)
>>> x = torch.LongTensor([[0, 1, 2]])
>>> x_embed = embed(x)
>>> x.size()
torch.Size([1, 3])
>>> x_embed.size()
torch.Size([1, 3, 50])
>>> out, hidden = gru(x_embed)
>>> out.size()
torch.Size([1, 3, 50])
>>> hidden.size()
torch.Size([1, 1, 50])
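Here out stacks the hidden state at each of the 3 timesteps, while hidden holds only the final hidden state of the single layer, so hidden[0] equals out[:, -1, :] in this setup.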
Key point
As the two snippets above show, LSTM, GRU, and RNN actually use the same parameter dimensions. That is, torch.nn.LSTM, torch.nn.GRU, and torch.nn.RNN construct their parameters through the same process, and all three take the same arguments at initialization. The only difference is that, once given the input dimensions, the number and kind of weight matrices they build internally may differ. Consequently, the inputs that LSTM, GRU, and RNN accept are essentially the same as well.
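A small sketch illustrating this: all three modules accept the same constructor arguments, and the total parameter count scales exactly with the number of internal gates (the numbers below assume input_size=28, hidden_size=10, num_layers=2):

common = dict(input_size=28, hidden_size=10, num_layers=2, batch_first=True)
for name, m in [('RNN', nn.RNN(**common)), ('GRU', nn.GRU(**common)), ('LSTM', nn.LSTM(**common))]:
    print(name, sum(p.numel() for p in m.parameters()))
# RNN 620, GRU 1860, LSTM 2480 -- exactly 1x / 3x / 4x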