A Brief Introduction to RNNs, with Implementations
Importing libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
A simple torch.nn.RNNCell example
A single-layer RNN:
batchSize = 1
seqLen = 3  # x1 x2 x3
inputSize = 4
hiddenSize = 2
# build the cell
cell = nn.RNNCell(input_size=inputSize, hidden_size=hiddenSize)
# fabricate input data
dataset = torch.randn(seqLen, batchSize, inputSize)
# initial hidden state h0, all zeros
hidden = torch.zeros(batchSize, hiddenSize)
for idx, input in enumerate(dataset):  # take x1, x2, x3 in turn
    print('=' * 20, idx, '=' * 20)
    print('Input size: ', input.shape)
    hidden = cell(input, hidden)
    print('outputs size: ', hidden.shape)
    print(hidden)
    print('-' * 50)
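For reference, RNNCell computes $h' = \tanh(x W_{ih}^T + b_{ih} + h W_{hh}^T + b_{hh})$. Below is a minimal sketch of my own (not from the original post) that checks this against the cell's learned parameters; weight_ih, weight_hh, bias_ih, and bias_hh are attributes of nn.RNNCell:
x = dataset[0]  # first time step, shape (batchSize, inputSize)
h0 = torch.zeros(batchSize, hiddenSize)
# replicate the cell's update by hand
manual = torch.tanh(x @ cell.weight_ih.T + cell.bias_ih
                    + h0 @ cell.weight_hh.T + cell.bias_hh)
print(torch.allclose(manual, cell(x, h0)))  # expected: True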
A simple torch.nn.RNN example
A multi-layer RNN:
Here $inputs = [x_1, x_2, \dots, x_N]$ and $outputs = [h_1, h_2, \dots, h_N]$. $h_0^1$ is the initial hidden state $h_0$ fed to the first hidden layer, and $h_N^1$ is the first hidden layer's final output $h_N$.
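For reference (my addition, following the formula in the PyTorch documentation for nn.RNN), each layer $l$ updates its hidden state at step $t$ as
$$h_t^{(l)} = \tanh\!\left(x_t^{(l)} W_{ih}^{(l)\,T} + b_{ih}^{(l)} + h_{t-1}^{(l)} W_{hh}^{(l)\,T} + b_{hh}^{(l)}\right),$$
where $x_t^{(1)} = x_t$ is the raw input and, for $l > 1$, $x_t^{(l)} = h_t^{(l-1)}$ is the output of the layer below.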
# multi-layer RNN (num_layers = 1 here; increase it to stack layers)
batch_size = 1
seq_len = 3
input_size = 4
hidden_size = 2
num_layers = 1
cell = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
inputs = torch.randn(seq_len, batch_size, input_size)
hidden = torch.zeros(num_layers, batch_size, hidden_size)  # h0
out, hidden = cell(inputs, hidden)
print('Output size: ', out.shape)
print('Output: ', out)
print('Hidden size: ', hidden.shape)
print('Hidden: ', hidden)
print('-' * 50)
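To make the difference between out and hidden concrete, here is a small sketch of my own with num_layers = 2: out stacks the top layer's hidden state at every time step, while hidden holds the last time step's hidden state of every layer.
rnn2 = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=2)
h0 = torch.zeros(2, batch_size, hidden_size)
out2, hN = rnn2(torch.randn(seq_len, batch_size, input_size), h0)
print(out2.shape)  # (seqLen, batchSize, hiddenSize) -> torch.Size([3, 1, 2])
print(hN.shape)    # (numLayers, batchSize, hiddenSize) -> torch.Size([2, 1, 2])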
A simple Seq2Seq example
Task: train a model to learn the mapping "hello" -> "ohlol".
Since "hello" and "ohlol" are strings, they cannot take part in computation directly, so each character is first mapped to an index; the indices are then represented as one-hot vectors.
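As an aside (my addition, assuming PyTorch >= 1.1), the manual lookup table used below can also be replaced by torch.nn.functional.one_hot:
import torch.nn.functional as F
x_idx = torch.LongTensor([1, 0, 2, 2, 3])       # indices for "hello"
x_oh = F.one_hot(x_idx, num_classes=4).float()  # (seqLen, 4) one-hot matrix
print(x_oh)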
Using torch.nn.RNNCell
'''
There are four character classes, so input_size = 4.
'''
input_size = 4
batch_size = 1
hidden_size = 4
# map each character to an index
idx2char = ['e', 'h', 'l', 'o']
x_data = [1, 0, 2, 2, 3]
y_data = [3, 1, 2, 3, 2]
# one-hot vectors
one_hot_lookup = [[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]]
x_one_hot = [one_hot_lookup[x] for x in x_data]
inputs = torch.tensor(x_one_hot, dtype=torch.float32).view(-1, batch_size, input_size)
labels = torch.LongTensor(y_data).view(-1, 1)  # (seqLen, 1)
# build the model
class Model(nn.Module):
    def __init__(self, input_size, batch_size, hidden_size):
        super(Model, self).__init__()
        self.input_size = input_size
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.rnncell = nn.RNNCell(input_size=self.input_size, hidden_size=self.hidden_size)

    def forward(self, input, hidden):
        hidden = self.rnncell(input, hidden)
        return hidden

    def init_hidden(self):
        return torch.zeros(self.batch_size, self.hidden_size)  # h0

net = Model(input_size, batch_size, hidden_size)
# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.1)
for epoch in range(15):
    loss = 0
    optimizer.zero_grad()  # clear gradients
    hidden = net.init_hidden()  # h0
    print("Predicted string: ", end='')
    for input, label in zip(inputs, labels):
        hidden = net(input, hidden)
        # each step yields the loss of one time step; accumulate over the
        # whole sequence and backpropagate once on the total
        loss += criterion(hidden, label)
        _, idx = hidden.max(dim=1)  # index of the largest class score
        print(idx2char[idx.item()], end='')
    loss.backward()
    optimizer.step()
    print(', Epoch [%d/15] loss=%.4f' % (epoch + 1, loss.item()))
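Once trained, the same loop can be reused for greedy decoding without gradient tracking; a short sketch of my own:
with torch.no_grad():
    hidden = net.init_hidden()
    chars = []
    for input in inputs:  # (seqLen, batchSize, inputSize), one step at a time
        hidden = net(input, hidden)
        chars.append(idx2char[hidden.max(dim=1)[1].item()])
    print(''.join(chars))  # ideally "ohlol" once training has converged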
Using torch.nn.RNN
# use torch.nn.RNN
input_size = 4
batch_size = 1
hidden_size = 4
num_layers = 1
# map each character to an index
idx2char = ['e', 'h', 'l', 'o']
x_data = [1, 0, 2, 2, 3]
y_data = [3, 1, 2, 3, 2]
one_hot_lookup = [[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]]
x_one_hot = [one_hot_lookup[idx] for idx in x_data]  # convert the input to one-hot
inputs = torch.tensor(x_one_hot, dtype=torch.float32).view(-1, batch_size, input_size)  # reshape to (seqLen, batchSize, inputSize)
labels = torch.LongTensor(y_data)  # (seqLen * batchSize,)
# define the network
class Net(nn.Module):
    def __init__(self, input_size, batch_size, hidden_size, num_layers):
        super(Net, self).__init__()
        self.input_size = input_size
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=self.num_layers)

    def forward(self, x, hidden):  # takes both x and h0
        out, hidden = self.rnn(x, hidden)
        return out.view(-1, self.hidden_size)  # (seqLen * batchSize, hiddenSize)

    def init_hidden(self):
        # h0; it could also be constructed outside and passed in
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size)

model = Net(input_size, batch_size, hidden_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
for epoch in range(15):
    optimizer.zero_grad()
    hidden = model.init_hidden()  # h0
    # forward
    # torch.nn.RNN loops over the sequence internally, so no per-step loop is needed
    outputs = model(inputs, hidden)
    l = criterion(outputs, labels)
    # backward and update
    l.backward()
    optimizer.step()
    _, predicted = outputs.max(dim=1)  # index of the largest class score
    predicted = predicted.data.numpy()
    print('Predicted string: ', ''.join([idx2char[idx] for idx in predicted]), end='')
    print(', Epoch [%d/15] loss=%.4f' % (epoch + 1, l.item()))
Using embedding and linear layers
Drawbacks of one-hot vectors:
- high-dimensional
- sparse
- hard-coded (the character-to-vector mapping is fixed by hand, not learned)
One remedy: use embedding vectors, as sketched below.
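A quick sketch of my own showing what an embedding layer returns: a dense, learnable vector per index instead of a sparse one-hot row.
emb = nn.Embedding(4, 10)                      # 4 indices, 10-dimensional vectors
vecs = emb(torch.LongTensor([1, 0, 2, 2, 3]))  # indices for "hello"
print(vecs.shape)                              # torch.Size([5, 10]), dense and trainable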
Network structure: embedding layer → RNN → linear layer.
# Embedding and linear layers
# data
idx2char = ['e', 'h', 'l', 'o']
x_data = [1, 0, 2, 2, 3]
y_data = [3, 1, 2, 3, 2]
# nn.Embedding expects a LongTensor of indices as input
inputs = torch.LongTensor(x_data)
labels = torch.LongTensor(y_data)
# parameters
num_class = 4
input_size = 4
hidden_size = 8
embedding_size = 10
num_layers = 2
batch_size = 1
seq_len = 5
# input should be (batchSize, seqLen); target should be (batchSize * seqLen,)
inputs = inputs.view(batch_size, seq_len)

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # embedding table with input_size entries; output shape is
        # (*, embedding_size), where * is the input shape
        self.emd = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, x):
        hidden = torch.zeros(num_layers, x.size(0), hidden_size)
        x = self.emd(x)  # (batchSize, seqLen, embeddingSize)
        x, hidden = self.rnn(x, hidden)
        x = self.fc(x)
        return x.view(-1, num_class)

net = Model()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.05)
for epoch in range(15):
    optimizer.zero_grad()
    outputs = net(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    _, predicted = outputs.max(dim=1)  # index of the largest class score
    predicted = predicted.data.numpy()
    print('Predicted string is', ''.join([idx2char[idx] for idx in predicted]), end='')
    print(', Epoch [%d/15] loss=%.4f' % (epoch + 1, loss.item()))