Learning to Train an LSTM for Sentiment Analysis on IMDB Reviews

Scenario

Here I hand-write an LSTM model and train it for sentiment analysis on the IMDB review dataset. This is an exercise from a course I am studying, so experts please don't take it too seriously.

PyTorch

train.py

import sys

import torch
import tqdm
from visdom import Visdom


def train(dataloader, model, criterion, optimizer, device):
    # create a Visdom window (assumes a Visdom server is already listening on port 8097)
    viz = Visdom(port=8097)
    # initialize the loss curve
    viz.line([0.], [0.], win='train_loss', opts=dict(title='train loss'))

    model.train()
    epoch_losses = []
    epoch_accs = []
    for i, batch in enumerate(tqdm.tqdm(dataloader, desc='training...', file=sys.stdout)):
        (label, ids, length) = batch
        label = label.to(device)
        ids = ids.to(device)
        length = length.to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label) # compute the loss
        accuracy = get_accuracy(prediction, label)
        # gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
        # append the latest loss to the Visdom curve
        # (note: the batch index i restarts at 0 each epoch, so the curve folds back across epochs)
        viz.line([loss.item()], [i], win='train_loss', update='append')
    return epoch_losses, epoch_accs

def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            (label, ids, length) = batch
            label = label.to(device)
            ids = ids.to(device)
            length = length.to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label) # compute the loss
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return epoch_losses, epoch_accs

def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

This is the training code: it pushes the loss curve to Visdom for monitoring, performs the gradient updates, computes the loss and accuracy, and evaluates the model on the validation set. Start a Visdom server first (python -m visdom.server) so the plotting calls have something to connect to.
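As a quick sanity check of get_accuracy, here is a tiny worked example of my own (not part of the course code; it assumes train.py's dependencies, visdom and tqdm, are installed so the import succeeds): three of the four argmaxes match the labels, so the accuracy is 0.75.

import torch
from train import get_accuracy

# logits for a batch of 4 examples over 2 classes
prediction = torch.tensor([[0.1, 0.9],
                           [0.8, 0.2],
                           [0.3, 0.7],
                           [0.6, 0.4]])
label = torch.tensor([1, 0, 1, 1])
# argmaxes are [1, 0, 1, 0]; three of four match the labels
print(get_accuracy(prediction, label))  # tensor(0.7500)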

LSTM.py

# define the model
import torch


class LSTM(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index=0):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                                  dropout=dropout_rate, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        embedded = self.dropout(self.embedding(ids))
        # pack the padded batch; pack_padded_sequence requires lengths as a 1D CPU int64 tensor
        packed_embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, length.cpu(), batch_first=True,
                                                                  enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # output (the per-step hidden states) is unpacked but not used; we classify from the final hidden state
        output, output_length = torch.nn.utils.rnn.pad_packed_sequence(packed_output)
        if self.lstm.bidirectional:
            # hidden[-2] and hidden[-1] hold the top layer's forward and backward final states
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        prediction = self.fc(hidden)
        return prediction

The model subclasses torch.nn.Module: an embedding layer (with the pad index masked out), a multi-layer bidirectional LSTM over the packed sequence, dropout, and a linear layer that maps the final hidden state to the two output classes.
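One detail worth spelling out: for a bidirectional LSTM, hidden has shape (n_layers * 2, batch, hidden_dim), and the last two slices along the first axis are the top layer's forward and backward final states, which is why forward concatenates hidden[-1] and hidden[-2]. A minimal shape check of my own (not part of the original code):

import torch

lstm = torch.nn.LSTM(input_size=8, hidden_size=16, num_layers=2,
                     bidirectional=True, batch_first=True)
x = torch.randn(4, 10, 8)  # (batch, seq_len, input_size)
output, (hidden, cell) = lstm(x)
print(hidden.shape)  # torch.Size([4, 4, 16]): (2 layers * 2 directions, batch, hidden_dim)
# concatenating the two directions of the top layer doubles the feature size
print(torch.cat([hidden[-1], hidden[-2]], dim=-1).shape)  # torch.Size([4, 32])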

main.py

import numpy as np
import torch
import torchtext
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

from LSTM import LSTM
from train import train, evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def main():
    train_iter = torchtext.datasets.IMDB(root='./data', split='train')
    # create the tokenizer
    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
    print(tokenizer('here is the an example!'))

    # build the vocabulary
    def yield_tokens(data_iter):
        for _, text in data_iter:
            yield tokenizer(text)

    vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    print(vocab(tokenizer('here is the an example <pad> <pad>')))

    # data processing pipelines
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: 1 if x == 'pos' else 0

    print(text_pipeline('here is the an example'))

    print(label_pipeline('neg'))

    def collate_batch(batch):
        max_length = 256
        pad = text_pipeline('<pad>')  # the id list for '<pad>', i.e. [0]
        label_list, text_list, length_list = [], [], []
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = text_pipeline(_text)[:max_length]  # truncate to max_length tokens
            length_list.append(len(processed_text))
            text_list.append((processed_text + pad * max_length)[:max_length])  # right-pad to max_length
        label_list = torch.tensor(label_list, dtype=torch.int64)
        text_list = torch.tensor(text_list, dtype=torch.int64)
        length_list = torch.tensor(length_list, dtype=torch.int64)
        return label_list.to(device), text_list.to(device), length_list.to(device)

    train_dataset = to_map_style_dataset(train_iter)
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = random_split(train_dataset,
                                              [num_train, len(train_dataset) - num_train])
    train_dataloader = DataLoader(split_train_, batch_size=8, shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=8, shuffle=False, collate_fn=collate_batch)

    # instantiate the model
    vocab_size = len(vocab)
    embedding_dim = 300
    hidden_dim = 300
    output_dim = 2
    n_layers = 2
    bidirectional = True
    dropout_rate = 0.5

    model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate)
    model = model.to(device)

    # loss function and optimizer
    lr = 5e-4
    criterion = torch.nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    n_epochs = 10
    best_valid_loss = float('inf')

    train_losses = []
    train_accs = []
    valid_losses = []
    valid_accs = []

    for epoch in range(n_epochs):
        train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
        valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)
        train_losses.extend(train_loss)
        train_accs.extend(train_acc)
        valid_losses.extend(valid_loss)
        valid_accs.extend(valid_acc)
        epoch_train_loss = np.mean(train_loss)
        epoch_train_acc = np.mean(train_acc)
        epoch_valid_loss = np.mean(valid_loss)
        epoch_valid_acc = np.mean(valid_acc)
        if epoch_valid_loss < best_valid_loss:
            best_valid_loss = epoch_valid_loss
            torch.save(model.state_dict(), 'lstm.pt')
        print(f'epoch: {epoch + 1}')
        print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
        print(f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}')


if __name__ == '__main__':
    main()


That's the main training code. It downloads the IMDB review data, creates the tokenizer, builds the vocabulary, sets up the text and label pipelines, instantiates the custom LSTM model, chooses the loss function and optimizer, and runs training.
Training took three days on my laptop, because its GPU is not supported by PyTorch, so everything ran on the CPU. When training finishes, the model parameters are saved to lstm.pt.
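To verify that collate_batch produces what the model expects, here is a quick check you could drop into main() after the dataloaders are built (my own addition, not in the original script):

# each batch is (labels, padded token ids, original lengths)
label, ids, length = next(iter(train_dataloader))
print(label.shape)   # torch.Size([8])        one label per example
print(ids.shape)     # torch.Size([8, 256])   batch_size x max_length
print(length.shape)  # torch.Size([8])        lengths before padding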

predict.py

import torch
import torchtext

from LSTM import LSTM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def predict_sentiment(text, model, tokenizer, vocab, device):
    tokens = tokenizer(text)
    ids = [vocab[t] for t in tokens]
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    predicted_class_title = ['neg', 'pos']
    return predicted_class_title[predicted_class], predicted_probability

if __name__ == "__main__":
    text = "This film is terrible!"

    train_iter = torchtext.datasets.IMDB(root='./data', split='train')
    # create the tokenizer
    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')


    # build the vocabulary
    def yield_tokens(data_iter):
        for _, text in data_iter:
            yield tokenizer(text)


    vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    # load the model
    vocab_size = len(vocab)
    embedding_dim = 300
    hidden_dim = 300
    output_dim = 2
    n_layers = 2
    bidirectional = True
    dropout_rate = 0.5
    model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate)
    model.load_state_dict(torch.load('./lstm.pt'))
    model.to(device)
    model.eval()
    print(predict_sentiment(text, model, tokenizer, vocab, device))

Here the trained LSTM model predicts the sentiment of "This film is terrible!". The result:

('neg', 0.9985383749008179)
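The same helper works for any other input string; for example (sentences of my own, with the model, tokenizer, and vocab from predict.py in scope):

for s in ["A wonderful, heartwarming movie.",
          "Utterly boring and far too long."]:
    print(s, predict_sentiment(s, model, tokenizer, vocab, device))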

Summary

The full code is available at:
https://github.com/fxtxz2/geektime-lstm
The LSTM model here is custom-defined for learning purposes; note that current PyTorch already ships a built-in LSTM layer (torch.nn.LSTM), which is what the model above wraps.
