Scenario
Here I write an LSTM model by hand and train it for sentiment analysis on the IMDB review dataset. This is an example from a course I am taking, so if you are an expert, please don't take it too seriously.
PyTorch
train.py
import sys

import torch
import tqdm
from visdom import Visdom


def train(dataloader, model, criterion, optimizer, device):
    # create a Visdom window (note: re-initialized at the start of every epoch)
    viz = Visdom(port=8097)
    # initialize the window
    viz.line([0.], [0.], win='train_loss', opts=dict(title='train loss'))
    model.train()
    epoch_losses = []
    epoch_accs = []
    for i, batch in enumerate(tqdm.tqdm(dataloader, desc='training...', file=sys.stdout)):
        (label, ids, length) = batch
        label = label.to(device)
        ids = ids.to(device)
        length = length.to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)  # compute the loss
        accuracy = get_accuracy(prediction, label)
        # gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
        # append the latest batch loss to the monitoring window
        viz.line([loss.item()], [i], win='train_loss', update='append')
    return epoch_losses, epoch_accs


def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            (label, ids, length) = batch
            label = label.to(device)
            ids = ids.to(device)
            length = length.to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)  # compute the loss
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return epoch_losses, epoch_accs


def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy
This is the training code. Its main jobs are monitoring the loss while training, running the gradient updates, computing the loss, computing the accuracy, and evaluating the model on the validation set.
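One easy-to-miss detail: Visdom plots to a local server, which has to be running before train.py starts, and the port must match the 8097 passed to Visdom above. A minimal sketch of checking the connection up front:

# start the server in a separate terminal first:
#   python -m visdom.server -port 8097
from visdom import Visdom

viz = Visdom(port=8097)
# check_connection() returns False if the server is not reachable
assert viz.check_connection(), 'start the server with: python -m visdom.server'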
LSTM.py
# model definition
import torch


class LSTM(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index=0):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                                  dropout=dropout_rate, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        embedded = self.dropout(self.embedding(ids))
        # pack the padded batch so the LSTM skips the padding; the lengths
        # must be a CPU int64 tensor, hence length.cpu()
        packed_embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, length.cpu(), batch_first=True,
                                                                  enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, output_length = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        if self.lstm.bidirectional:
            # concatenate the last layer's final forward and backward hidden states
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        prediction = self.fc(hidden)
        return prediction
Here we subclass torch.nn.Module to implement the LSTM model: the padded batch is embedded, packed, run through the (optionally bidirectional) LSTM, and the final hidden state is fed to a linear layer to produce the class logits.
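To sanity-check the model's input and output shapes, here is a minimal sketch; the tiny dimensions and the random token ids are made up purely for the demo:

import torch
from LSTM import LSTM

model = LSTM(vocab_size=100, embedding_dim=8, hidden_dim=16, output_dim=2,
             n_layers=2, bidirectional=True, dropout_rate=0.5)
ids = torch.randint(1, 100, (4, 10))  # batch of 4 padded sequences, 10 token ids each
length = torch.tensor([10, 7, 5, 3])  # true (unpadded) lengths; enforce_sorted=False allows any order
print(model(ids, length).shape)       # torch.Size([4, 2]): one pair of class logits per review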
main.py
import numpy as np
import torch
import torchtext
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from LSTM import LSTM
from train import train, evaluate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def main():
    train_iter = torchtext.datasets.IMDB(root='./data', split='train')
    # create the tokenizer
    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
    print(tokenizer('here is the an example!'))

    # build the vocabulary
    def yield_tokens(data_iter):
        for _, text in data_iter:
            yield tokenizer(text)

    vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    print(vocab(tokenizer('here is the an example <pad> <pad>')))

    # data processing pipelines
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: 1 if x == 'pos' else 0
    print(text_pipeline('here is the an example'))
    print(label_pipeline('neg'))

    def collate_batch(batch):
        max_length = 256
        pad = text_pipeline('<pad>')
        label_list, text_list, length_list = [], [], []
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = text_pipeline(_text)[:max_length]
            length_list.append(len(processed_text))
            # right-pad every sample to exactly max_length tokens
            text_list.append((processed_text + pad * max_length)[:max_length])
        label_list = torch.tensor(label_list, dtype=torch.int64)
        text_list = torch.tensor(text_list, dtype=torch.int64)
        length_list = torch.tensor(length_list, dtype=torch.int64)
        return label_list.to(device), text_list.to(device), length_list.to(device)

    # the raw iterator was consumed while building the vocabulary, so re-create it
    train_iter = torchtext.datasets.IMDB(root='./data', split='train')
    train_dataset = to_map_style_dataset(train_iter)
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = random_split(train_dataset,
                                              [num_train, len(train_dataset) - num_train])
    train_dataloader = DataLoader(split_train_, batch_size=8, shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=8, shuffle=False, collate_fn=collate_batch)

    # instantiate the model
    vocab_size = len(vocab)
    embedding_dim = 300
    hidden_dim = 300
    output_dim = 2
    n_layers = 2
    bidirectional = True
    dropout_rate = 0.5
    model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate)
    model = model.to(device)

    # loss function and optimizer
    lr = 5e-4
    criterion = torch.nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    n_epochs = 10
    best_valid_loss = float('inf')
    train_losses = []
    train_accs = []
    valid_losses = []
    valid_accs = []
    for epoch in range(n_epochs):
        train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
        valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)
        train_losses.extend(train_loss)
        train_accs.extend(train_acc)
        valid_losses.extend(valid_loss)
        valid_accs.extend(valid_acc)
        epoch_train_loss = np.mean(train_loss)
        epoch_train_acc = np.mean(train_acc)
        epoch_valid_loss = np.mean(valid_loss)
        epoch_valid_acc = np.mean(valid_acc)
        # keep the checkpoint with the best validation loss
        if epoch_valid_loss < best_valid_loss:
            best_valid_loss = epoch_valid_loss
            torch.save(model.state_dict(), 'lstm.pt')
        print(f'epoch: {epoch + 1}')
        print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
        print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')
if __name__ == '__main__':
    main()
That is the main code for training the LSTM model. It downloads the IMDB review data, creates the tokenizer, builds the vocabulary, sets up the text and label processing pipelines, instantiates the custom LSTM model, chooses the loss function and optimizer, and runs the training loop.
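The pad-and-truncate line in collate_batch is worth a closer look. A toy illustration with invented token ids (0 standing in for <pad>):

max_length = 5
pad = [0]                     # id of '<pad>' in the vocabulary
processed_text = [11, 42, 7]  # three real tokens
padded = (processed_text + pad * max_length)[:max_length]
print(padded)                 # [11, 42, 7, 0, 0]: right-padded to exactly max_length
print(len(processed_text))    # 3: the true length later fed to pack_padded_sequence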
I trained this on my own laptop for 3 days, because its GPU is not supported by PyTorch. Once training finishes, the model weights are saved to lstm.pt.
predict.py
import torch
import torchtext

from LSTM import LSTM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def predict_sentiment(text, model, tokenizer, vocab, device):
    tokens = tokenizer(text)
    ids = [vocab[t] for t in tokens]
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    predicted_class_title = ['neg', 'pos']
    return predicted_class_title[predicted_class], predicted_probability


if __name__ == "__main__":
    text = "This film is terrible!"
    train_iter = torchtext.datasets.IMDB(root='./data', split='train')
    # create the tokenizer
    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

    # build the vocabulary (must match the one used for training)
    def yield_tokens(data_iter):
        for _, text in data_iter:
            yield tokenizer(text)

    vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    # load the model; map_location lets a GPU-trained checkpoint load on CPU and vice versa
    vocab_size = len(vocab)
    embedding_dim = 300
    hidden_dim = 300
    output_dim = 2
    n_layers = 2
    bidirectional = True
    dropout_rate = 0.5
    model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate)
    model.load_state_dict(torch.load('./lstm.pt', map_location=device))
    model.to(device)
    model.eval()
    print(predict_sentiment(text, model, tokenizer, vocab, device))
Here the trained LSTM model is used to predict the sentiment of "This film is terrible!". The result:
('neg', 0.9985383749008179)
Summary
The full code is available here:
https://github.com/fxtxz2/geektime-lstm
The model above is a custom classifier built on top of torch.nn.LSTM; PyTorch ships the LSTM layer itself out of the box.
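For comparison, a minimal sketch of calling the built-in torch.nn.LSTM directly on an already-embedded batch (shapes are annotated for batch_first=True; the numbers mirror the hyperparameters used above):

import torch

lstm = torch.nn.LSTM(input_size=300, hidden_size=300, num_layers=2,
                     bidirectional=True, batch_first=True)
x = torch.randn(8, 256, 300)      # (batch, seq_len, embedding_dim)
output, (hidden, cell) = lstm(x)
print(output.shape)               # torch.Size([8, 256, 600]): both directions concatenated
print(hidden.shape)               # torch.Size([4, 8, 300]): (num_layers * 2 directions, batch, hidden)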