1. The LSTM layer
class torch.nn.LSTM(*args, **kwargs)
# Constructor arguments in PyTorch
input_size – feature dimension of the input (when an embedding layer is used, this is the embedding dim)
hidden_size – dimension of the hidden state
num_layers – number of stacked LSTM layers; layer i uses the output of layer i-1 as its input. Default: 1
bias – whether to use a bias in the affine transformations. Default: True
batch_first – controls where the batch dimension sits; if True the input is expected as (batch, seq, feature), otherwise as (seq, batch, feature). Default: False
dropout – adds a dropout layer after the output of every LSTM layer except the last; the dropout ratio defaults to 0 (it only takes effect when num_layers > 1)
bidirectional – whether to use a bidirectional LSTM, which encodes the sequence forward from the start and backward from the end at the same time and concatenates the hidden states of the two directions at each time step as the output. Default: False
proj_size – adds a projection on top of the hidden state h_t with the given output dimension; defaults to 0, i.e. no projection.
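As a quick sanity check on these arguments, here is a minimal sketch (the sizes are purely illustrative) showing the input and output shapes of a 2-layer LSTM with batch_first=True:

import torch
import torch.nn as nn

# Purely illustrative sizes
lstm = nn.LSTM(input_size=100, hidden_size=256, num_layers=2,
               batch_first=True, dropout=0.5)   # dropout only takes effect because num_layers > 1

x = torch.randn(8, 35, 100)                     # (batch, seq, feature) because batch_first=True
output, (h_n, c_n) = lstm(x)

print(output.shape)   # torch.Size([8, 35, 256])  -> (batch, seq, hidden_size)
print(h_n.shape)      # torch.Size([2, 8, 256])   -> (num_layers * num_directions, batch, hidden_size)
print(c_n.shape)      # torch.Size([2, 8, 256])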
2. The embedding layer
Before the embedding layer, sentences first have to be converted into tokens. For English the most common approach is to split on whitespace, but the results are not great; the usual choice in torchtext is to tokenize with the spaCy framework.
# Define how text and labels are tokenized; the text field tokenizes with spaCy,
# and include_lengths=True makes it also return the length of each tokenized sentence, which is handy later on.
TEXT = data.Field(tokenize = 'spacy',
tokenizer_language = 'en_core_web_sm',
include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)
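To make the tokenization concrete, here is a small sketch (it assumes the en_core_web_sm model is installed) of what spaCy produces for a single sentence:

import spacy

nlp = spacy.load('en_core_web_sm')
tokens = [tok.text for tok in nlp.tokenizer("This film isn't great, is it?")]
print(tokens)
# e.g. ['This', 'film', 'is', "n't", 'great', ',', 'is', 'it', '?']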
The embedding layer turns tokens into vectors that the rest of the network can compute with. Word vectors are usually trained with one of two models, Skip-Gram and CBOW. Intuitively:
- Skip-Gram predicts the context given the input word,
- while CBOW predicts the input word given its context.
To speed up training there are two techniques, negative sampling and hierarchical softmax. In practice, pretrained word vectors are usually used; here we pick GloVe vectors (GloVe stands for Global Vectors for Word Representation).
# In torchtext, the pretrained vectors can be specified when the vocabulary is built.
# max_size sets the vocabulary size (the total is max_size + 2, because [pad] and [unk] are added)
# vectors selects which pretrained vectors to use
# unk_init determines how vectors for unknown tokens are initialized
TEXT.build_vocab(train_data,
max_size = MAX_VOCAB_SIZE,
vectors = "glove.6B.100d",
unk_init = torch.Tensor.normal_)
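Once the vocabulary is built it can be inspected to confirm its size and the attached GloVe vectors; a minimal sketch, assuming train_data and MAX_VOCAB_SIZE = 25000 as in the full demo below:

print(len(TEXT.vocab))           # MAX_VOCAB_SIZE + 2; the extra two entries are <unk> and <pad>
print(TEXT.vocab.itos[:5])       # most frequent tokens, e.g. ['<unk>', '<pad>', 'the', ',', '.']
print(TEXT.vocab.stoi['movie'])  # string-to-index lookup
print(TEXT.vocab.vectors.shape)  # torch.Size([MAX_VOCAB_SIZE + 2, 100]) for glove.6B.100d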
3. The data iterator
Since every sentence has a different length, the seq dimension varies across the data. To make computation easier, the usual approach is to make the seq dimension the same within each batch.
- BucketIterator in torchtext tries to put sentences of similar length into the same batch, minimizing the amount of padding.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(train_data, valid_data, test_data),
batch_size = BATCH_SIZE,
sort_within_batch = True,
device = device)
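Because include_lengths=True was set on TEXT, batch.text is a tuple of (token index tensor, sentence lengths). A minimal sketch of pulling one batch out of the iterator, assuming the fields and iterators defined above:

batch = next(iter(train_iterator))
text, text_lengths = batch.text
print(text.shape)         # [longest sent len in this batch, batch size]
print(text_lengths[:5])   # per-sentence lengths, in descending order because sort_within_batch=True
print(batch.label.shape)  # [batch size]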
4. Other details of the bidirectional LSTM implementation
- The embedding vector for [pad] can be set to [0]*embedding_size and kept out of gradient updates; this only requires setting padding_idx in the embedding layer.
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
- As mentioned above, each batch pads some of its sentences so that every sentence in the batch has the same number of tokens. The LSTM output we normally take is the hidden state of the last token, but if that last token is [pad] we would rather have the hidden state of the last non-pad token. This is where the sentence lengths returned during tokenization come in handy.
- nn.utils.rnn.pack_padded_sequence packs the embeddings together with the lengths before they are fed into the LSTM, so the LSTM returns an output plus the hidden state and cell state of the last non-pad token of each sentence.
- Applying nn.utils.rnn.pad_packed_sequence to the output "unpacks" it, recovering the top-layer hidden state for every token, padded back to a regular tensor.
# text shape: [sent len, batch size]
embedded = self.dropout(self.embedding(text))
# embedded shape: [sent len, batch size, emb dim]
# pack sequence
# lengths need to be on the CPU!
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
packed_output, (hidden, cell) = self.rnn(packed_embedded)
# hidden shape: [num layers * num directions, batch size, hid dim]
# unpack sequence
output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
# output shape: [sent len, batch size, hid dim * num directions]
# the entries of output at padding positions are zero tensors, because pad_packed_sequence fills them with zeros
- In a bidirectional LSTM, the final forward and backward hidden states have to be concatenated before being fed to the classification layer.
# concatenate the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden states
# and apply dropout
hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
# hidden shape: [batch size, hid dim * num directions]
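One detail worth spelling out: for a stacked bidirectional LSTM, hidden is laid out so that its last two slices are the forward and backward states of the top layer, which is why hidden[-2,:,:] and hidden[-1,:,:] are the ones concatenated. A self-contained sketch of this layout (the sizes are illustrative):

import torch
import torch.nn as nn

rnn = nn.LSTM(input_size=100, hidden_size=256, num_layers=2, bidirectional=True)
x = torch.randn(35, 64, 100)                       # [sent len, batch size, emb dim]
_, (hidden, _) = rnn(x)                            # hidden: [num layers * num directions, batch size, hid dim]

h = hidden.view(2, 2, 64, 256)                     # [num layers, num directions, batch size, hid dim]
print(torch.equal(h[-1, 0], hidden[-2]))           # True: forward state of the top layer
print(torch.equal(h[-1, 1], hidden[-1]))           # True: backward state of the top layer
top = torch.cat((hidden[-2], hidden[-1]), dim=1)   # [batch size, hid dim * 2], what gets fed to the classifier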
5. Using pretrained word vectors
After the model is built, the weights of the embedding layer need to be replaced with the GloVe vectors from above.
# Replace the randomly initialized embedding weights with the pretrained vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
The vectors for [pad] and [unk] also need to be zeroed out, because the unknown tokens were given random vectors earlier.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
6. Code demo
import torch
import torch.nn as nn
from torchtext.legacy import data
from torchtext.legacy import datasets
import torch.optim as optim
import spacy
import random
import time
# Set random seeds for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Define the torchtext fields
TEXT = data.Field(tokenize = 'spacy',
tokenizer_language = 'en_core_web_sm',
include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)
# Download the IMDB dataset and split it
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state = random.seed(SEED))
# Build the vocabulary
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data,
max_size = MAX_VOCAB_SIZE,
vectors = "glove.6B.100d",
unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)
# Create the data iterators
BATCH_SIZE = 64
# Train on GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(train_data, valid_data, test_data),
batch_size = BATCH_SIZE,
sort_within_batch = True,
device = device)
# Define the bidirectional LSTM model
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        # Embedding layer (word vectors); padding_idx keeps the [pad] vector fixed at zero
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        # Bidirectional LSTM
        self.rnn = nn.LSTM(embedding_dim,                 # input_size
                           hidden_dim,                    # hidden_size
                           num_layers=n_layers,           # number of stacked layers
                           bidirectional=bidirectional,   # bidirectional or not
                           dropout=dropout)               # dropout between LSTM layers
        # Fully connected output layer; hidden_dim * 2 because the forward and backward hidden states are concatenated
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        # text shape: [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [sent len, batch size, emb dim]
        # pack sequence; lengths need to be on the CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # output shape: [sent len, batch size, hid dim * num directions]
        # hidden shape: [num layers * num directions, batch size, hid dim]
        # concatenate the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden states
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        # hidden shape: [batch size, hid dim * num directions]
        return self.fc(hidden)
# Instantiate the model
INPUT_DIM = len(TEXT.vocab) # 25002: the 25000 most frequent words set above, plus the pad and unk tokens
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # index of the pad token, so the model can ignore padding
model = RNN(INPUT_DIM,
EMBEDDING_DIM,
HIDDEN_DIM,
OUTPUT_DIM,
N_LAYERS,
BIDIRECTIONAL,
DROPOUT,
PAD_IDX)
# Replace the randomly initialized embedding weights with the pretrained vectors
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
# Zero out the vectors for the unknown and padding tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
# Optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)
# Binary accuracy helper
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc
# Training loop for one epoch
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()            # zero the gradients
        text, text_lengths = batch.text  # batch.text is a tuple (numericalized tensor, sentence lengths)
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
# Evaluation loop
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text  # batch.text is a tuple (numericalized tensor, sentence lengths)
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
# Helper to format the elapsed time of an epoch
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
# Run the training
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # keep the parameters of the model that does best on the validation set; they are loaded later for prediction
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
# Load the parameters of the best model
model.load_state_dict(torch.load('tut2-model.pt'))
# Evaluate on the test set
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
# Sentiment prediction for a single sentence
def predict_sentiment(model, sentence):
    nlp = spacy.load('en_core_web_sm')
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  # tokenize with spaCy
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          # map tokens to vocabulary indices
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)                               # add a batch dimension: [sent len, 1]
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()
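A quick usage check of the prediction function (the example sentences are arbitrary). The return value is sigmoid(logit), i.e. a number in [0, 1]; with the usual IMDB label mapping, values near 0 correspond to a negative review and values near 1 to a positive one:

# Example sentences, purely for illustration
print(predict_sentiment(model, "This film is terrible"))  # expected to be close to 0 (negative)
print(predict_sentiment(model, "This film is great"))     # expected to be close to 1 (positive)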