- Learn what a language model is and how to train one
- Learn the basics of torchtext
- Build the vocabulary
  3.1 word to index and index to word
- Learn some of the basic torch.nn modules
  4.1 Linear
  4.2 RNN
  4.3 LSTM
  4.4 GRU
- RNN training tricks
  5.1 Gradient Clipping
- How to save and load a model
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random,os
USE_CUDA = torch.cuda.is_available()
random.seed(1000)
np.random.seed(1000)
torch.manual_seed(1000)
if USE_CUDA:
torch.cuda.manual_seed_all(1000)
device = torch.device('cuda' if USE_CUDA else 'cpu')
BATCH_SIZE = 32            # sequences per batch
EMBEDDING_SIZE = 650       # word-embedding dimension
MAX_VOCAB_SIZE = 50000     # keep only the 50,000 most frequent words
DATA_PATH = r'./data/demo10_pytorch_skip-Gram'
TRAIN_DATA = 'text8.train.txt'
TEST_DATA = 'text8.test.txt'
VALI_DATA = 'text8.dev.txt'
SAVE_MODEL = DATA_PATH + os.sep + 'lossm.pth'   # where the best checkpoint is written
TEXT = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(path=DATA_PATH,
text_field=TEXT,
train=TRAIN_DATA,
validation=VALI_DATA,
test=TEST_DATA)
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
VOCAB_SIZE = len(TEXT.vocab)
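# Item 3.1 of the outline (word to index and index to word) in action: after build_vocab,
# TEXT.vocab.stoi maps a token to its index and TEXT.vocab.itos maps an index back to the
# token. A minimal sketch -- the exact indices below depend on the corpus.
print(TEXT.vocab.stoi['the'])   # index of 'the' (text8's most frequent word)
print(TEXT.vocab.itos[:5])      # first few entries, starting with the special tokens
print(VOCAB_SIZE)               # MAX_VOCAB_SIZE plus the special tokens, typically 50002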
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(datasets=(train, val, test),
batch_size=BATCH_SIZE,
device=device,
bptt_len=50,
repeat=False,
shuffle=True)
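# Sanity check (illustrative, not part of the original pipeline): every batch from the
# BPTTIterator carries batch.text and batch.target of shape (bptt_len, batch_size), and
# target is simply text shifted forward by one token -- the next-word prediction target.
first_batch = next(iter(train_iter))
print(first_batch.text.shape, first_batch.target.shape)   # torch.Size([50, 32]) for both
print(' '.join(TEXT.vocab.itos[i.item()] for i in first_batch.text[:8, 0]))
print(' '.join(TEXT.vocab.itos[i.item()] for i in first_batch.target[:8, 0]))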
class RNNModel(torch.nn.Module):
def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, nlayers, dropout=0.5):
        ''' The model consists of the following layers:
        - a word embedding layer
        - a recurrent layer (RNN, LSTM, or GRU)
        - a linear layer mapping the hidden state to scores over the vocabulary
        - a dropout layer for regularization
        '''
super(RNNModel, self).__init__()
self.drop = torch.nn.Dropout(dropout)
self.encoder = torch.nn.Embedding(vocab_size, embed_size)
if rnn_type in ['LSTM','GRU']:
self.rnn = getattr(torch.nn, rnn_type)(embed_size, hidden_size, nlayers, dropout=dropout)
else:
try:
nonlinearity = {'RNN_TANH':'tanh', 'RNN_RELU':'relu'}[rnn_type]
except KeyError:
raise ValueError("""An invalid option for `--model` was supplied,
options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
self.rnn = torch.nn.RNN(embed_size, hidden_size, nlayers, nonlinearity=nonlinearity, dropout=dropout)
self.decoder = torch.nn.Linear(hidden_size, vocab_size)
self.init_weights()
self.rnn_type = rnn_type
self.hidden_size = hidden_size
self.nlayers = nlayers
def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.zero_()
def forward(self, input, hidden):
        ''' Forward pass:
        - embed the input words
        - feed the embeddings through the recurrent layer
        - a linear layer maps each hidden state to scores over the vocabulary
        '''
emb = self.drop(self.encoder(input))
output,hidden = self.rnn(emb, hidden)
decoded = self.decoder(self.drop(output.view(-1, output.size(2))))
return decoded.view(output.size(0),output.size(1),-1), hidden
def init_hidden(self, bsz, requires_grad=True):
weight = next(self.parameters())
if self.rnn_type =='LSTM':
return (weight.new_zeros((self.nlayers, bsz, self.hidden_size), requires_grad=requires_grad),
weight.new_zeros((self.nlayers, bsz, self.hidden_size),requires_grad=requires_grad))
else:
return weight.new_zeros((self.nlayers, bsz, self.hidden_size), requires_grad=requires_grad)
model = RNNModel(rnn_type='LSTM',vocab_size=VOCAB_SIZE, embed_size=EMBEDDING_SIZE,hidden_size=100, nlayers=1)
if USE_CUDA:
model = model.to(device)
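# Quick shape check (a sketch with made-up input): one forward pass on a small dummy batch.
# The decoder output has shape (seq_len, batch_size, vocab_size); for an LSTM the hidden
# state is an (h, c) pair, each of shape (nlayers, batch_size, hidden_size).
dummy_input = torch.randint(0, VOCAB_SIZE, (5, BATCH_SIZE), device=device)
dummy_hidden = model.init_hidden(BATCH_SIZE)
dummy_output, dummy_hidden = model(dummy_input, dummy_hidden)
print(dummy_output.shape)   # torch.Size([5, 32, VOCAB_SIZE])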
def repackage_hidden(h):
"""Wraps hidden states in new Tensors, to detach them from their history."""
if isinstance(h, torch.Tensor):
return h.detach()
else:
return tuple(repackage_hidden(v) for v in h)
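# Why detaching matters (toy illustration): detach() keeps a tensor's values but cuts it
# off from the computation graph, so backpropagation stops there. Applied to the hidden
# state between batches, this gives truncated backpropagation through time and keeps the
# graph (and memory use) bounded to one bptt_len window.
x = torch.ones(3, requires_grad=True)
y = (x * 2).detach()
print(y.requires_grad)   # False: gradients no longer flow back to x through y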
def evaluate(model, data):
model.eval()
total_loss = 0.
it = iter(data)
total_count = 0
with torch.no_grad():
hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
for i, batch in enumerate(it):
data, target = batch.text, batch.target
if USE_CUDA:
data, target = data.cuda(), target.cuda()
hidden = repackage_hidden(hidden)
            # gradients are already disabled by the outer torch.no_grad()
            output, hidden = model(data, hidden)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            # weight the running loss by the number of tokens in this batch
            total_count += np.multiply(*data.size())
            total_loss += loss.item() * np.multiply(*data.size())
loss = total_loss/total_count
model.train()
return loss
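# Language-model quality is usually reported as perplexity, the exponential of the
# token-averaged cross-entropy that evaluate() returns (a small helper sketch; the
# original code reports the raw loss only).
import math

def perplexity(avg_loss):
    # avg_loss: mean cross-entropy per token, e.g. the value returned by evaluate()
    return math.exp(avg_loss)

# usage after training: print(perplexity(evaluate(model, val_iter)))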
NUM_EPOCHS = 2
GRAD_CLIP = 5.
val_losses = []
loss_fn = torch.nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)
for epoch in range(NUM_EPOCHS):
model.train()
it = iter(train_iter)
hidden = model.init_hidden(BATCH_SIZE)
for i, batch in enumerate(it):
data, target = batch.text, batch.target
hidden = repackage_hidden(hidden)
output,hidden = model(data, hidden)
loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
optimizer.step()
if i%100 == 0:
print('epoch',epoch,i,loss.item())
if i%1000 == 0:
val_loss = evaluate(model, val_iter)
if len(val_losses) == 0 or val_loss < min(val_losses):
torch.save(model.state_dict(), SAVE_MODEL)
                print('best model saved to', SAVE_MODEL)
else:
scheduler.step()
val_losses.append(val_loss)
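# Loading the best checkpoint back, matching the "save and load a model" item in the
# outline (a sketch: rebuild the architecture with the same hyperparameters, restore the
# saved weights, then evaluate on the held-out test set).
best_model = RNNModel(rnn_type='LSTM', vocab_size=VOCAB_SIZE, embed_size=EMBEDDING_SIZE,
                      hidden_size=100, nlayers=1)
best_model.load_state_dict(torch.load(SAVE_MODEL, map_location=device))
best_model = best_model.to(device)
print('test loss:', evaluate(best_model, test_iter))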