Building an LSTM Classifier for IMDB Sentiment Classification with PyTorch
This post builds an LSTM sentiment classifier with PyTorch and evaluates it on the IMDB dataset, covering text preprocessing, dataset loading, model training, saving, and testing.
1. Data Preprocessing
Download the IMDB movie-review dataset to a local directory; the data is class-balanced. After unpacking, there are 25000 labeled training reviews and 25000 labeled test reviews, with 12500 positive and 12500 negative samples in each split. Preprocessing first cleans each review (each stored in its own txt file): punctuation is normalized and the text is lowercased. The cleaned reviews are then merged into a single file, one sample per line, with the first character of each line holding the label. Finally, a vocabulary is built by keeping only the MAX_WORD most frequent words, which filters out low-frequency words.
# Clean text: normalize punctuation, collapse whitespace, lowercase
def clean_str(string):
    # keep letters, digits and the few punctuation marks that the rules below separate
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Tokenizer: plain whitespace split
def tokenizer(sentence):
    return sentence.split()

# Preprocessing: write one "label text" line per review and build the vocabulary
def data_process(text_path, text_dir):
    print("data preprocess")
    file_pro = open(text_path, 'w', encoding='utf-8')
    for root, s_dirs, _ in os.walk(text_dir):  # walk the sub-folders of train/ (or test/)
        for sub_dir in s_dirs:
            i_dir = os.path.join(root, sub_dir)
            text_list = os.listdir(i_dir)
            tag = os.path.split(i_dir)[-1]  # the folder name is the label
            if tag == 'pos':
                label = '1'
            elif tag == 'neg':
                label = '0'
            else:  # skip 'unsup' and any other folder
                continue
            for name in text_list:
                if not name.endswith('txt'):  # skip non-txt files
                    continue
                f = open(os.path.join(i_dir, name), 'r', encoding='utf-8')
                raw_line = f.readline()  # each review file is a single line
                pro_line = clean_str(raw_line)
                tokens = tokenizer(pro_line)  # tokenize and update the word counts
                for token in tokens:
                    if token in word_count:
                        word_count[token] += 1
                    else:
                        word_count[token] = 1  # a first occurrence counts as 1, not 0
                file_pro.write(label + ' ' + pro_line + '\n')
                f.close()
    file_pro.flush()
    file_pro.close()
    print("build vocabulary")
    vocab = {"<UNK>": 0, "<PAD>": 1}
    # sort words by frequency and keep only the MAX_WORD most frequent ones
    word_count_sort = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    word_number = 1
    for word in word_count_sort:
        if word[0] not in vocab:
            vocab[word[0]] = len(vocab)
            word_number += 1
        if word_number > MAX_WORD:
            break
    return vocab
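A quick sanity check of the cleaning and tokenizing steps (the sentence is made up, not from the dataset):

print(clean_str("I LOVED it, didn't you?!"))
# -> i loved it , did n't you ? !
print(tokenizer(clean_str("I LOVED it, didn't you?!")))
# -> ['i', 'loved', 'it', ',', 'did', "n't", 'you', '?', '!']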
2. Defining the Dataset
The Dataset loads the preprocessed file and yields each sample's text and label.
# Dataset definition
class MyDataset(Dataset):
    def __init__(self, text_path):
        file = open(text_path, 'r', encoding='utf-8')
        self.text_with_tag = file.readlines()  # one "label text" line per sample
        file.close()

    def __getitem__(self, index):  # fetch one sample's label and text
        line = self.text_with_tag[index]
        label = int(line[0])  # the first character is the label
        text = line[2:-1]     # the rest, minus the trailing newline, is the review
        return text, label

    def __len__(self):
        return len(self.text_with_tag)
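A minimal usage sketch, assuming train.txt has already been produced by data_process:

dataset = MyDataset(text_path='./train.txt')
text, label = dataset[0]
print(label, text[:60])  # the label and the first 60 characters of the review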
3. Building the LSTM Model
The model takes the vocabulary vocab, the embedding dimension embed_size, the hidden-state size num_hiddens, and the number of stacked LSTM layers num_layers.
# LSTM model definition
class LSTM(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch_size, seq_len); nn.LSTM expects the sequence
        # dimension first, so transpose before looking up the embeddings
        embeddings = self.embedding(inputs.permute(1, 0))
        # with no initial state passed in, outputs holds the top layer's hidden
        # state at every time step, shape (seq_len, batch_size, num_hiddens)
        outputs, _ = self.encoder(embeddings)
        # use the hidden state at the final time step as the sequence
        # representation, shape (batch_size, num_hiddens)
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # class probabilities [p_neg, p_pos]
        return outs
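A quick shape check with a dummy vocabulary and random indices confirms what forward returns (all numbers here are arbitrary):

dummy_vocab = {'<UNK>': 0, '<PAD>': 1, 'good': 2, 'bad': 3}
m = LSTM(vocab=dummy_vocab, embed_size=8, num_hiddens=16, num_layers=2)
x = torch.randint(0, len(dummy_vocab), (4, 10))  # (batch_size=4, seq_len=10)
print(m(x).shape)  # torch.Size([4, 2]), one probability pair per sample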
4. Training and Testing Functions
A batch produced by the PyTorch DataLoader contains raw strings, so a text_transform function (defined after train below) converts each batch into fixed-length (MAX_LEN) vectors of word indices: sentences shorter than MAX_LEN are padded with <PAD>, longer ones are truncated.
# Model training
def train(model, train_data, vocab, epoch=10):
    print('train model')
    model = model.to(device)
    # loss function and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)
    for ep in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # running loss
        avg_acc = 0   # running accuracy
        for idx, (text, label) in enumerate(tqdm(train_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            loss = criterion(pred.log(), train_y)  # NLLLoss expects log-probabilities
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # average loss and accuracy over the epoch
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)
        print("epoch:", ep + 1, "avg_loss:", avg_loss, "train_avg_acc:", avg_acc)
    # save the trained parameters
    torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')
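Taking .log() of Softmax output is numerically fragile when a probability underflows to zero. A common alternative, shown here only as a sketch rather than a change to the code above, is to return raw logits from forward and let nn.CrossEntropyLoss fuse LogSoftmax and NLLLoss:

# in LSTM.forward, return self.decoder(encoding) directly (raw logits), then:
criterion = torch.nn.CrossEntropyLoss()
loss = criterion(logits, train_y)  # equivalent to NLLLoss applied to log_softmax(logits)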
# Convert a batch of sentences into fixed-length (MAX_LEN) index tensors using vocab
def text_transform(sentence_list, vocab):
    sentence_index_list = []
    for sentence in sentence_list:
        # map each token to its index, falling back to <UNK> for out-of-vocabulary words
        sentence_idx = [vocab[token] if token in vocab else vocab['<UNK>'] for token in tokenizer(sentence)]
        if len(sentence_idx) < MAX_LEN:
            for i in range(MAX_LEN - len(sentence_idx)):  # pad short sentences with <PAD>
                sentence_idx.append(vocab['<PAD>'])
        sentence_idx = sentence_idx[:MAX_LEN]  # truncate long sentences to MAX_LEN
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)
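With a toy vocabulary the transform behaves as follows (toy values for illustration; the real script pads out to MAX_LEN = 300):

toy_vocab = {'<UNK>': 0, '<PAD>': 1, 'great': 2, 'movie': 3}
print(text_transform(['great movie !'], toy_vocab))
# if MAX_LEN were 5 this would print tensor([[2, 3, 0, 1, 1]]):
# '!' is out of vocabulary -> <UNK>, and <PAD>s fill the remaining slots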
# Model testing
def test(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    with torch.no_grad():  # no gradients needed at test time
        for idx, (text, label) in enumerate(tqdm(test_data)):
            test_x = text_transform(text, vocab).to(device)
            test_y = label.to(device)
            pred = model(test_x)
            avg_acc += accuracy(pred, test_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute batch accuracy (valid for binary 0/1 labels)
def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]  # predicted class indices
    # |pred - true| is 1 exactly when a 0/1 prediction is wrong, so the batch
    # size minus the sum of absolute differences is the number of correct predictions
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))
    return acc.detach().cpu().numpy() / len(y_pred)
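A tiny worked example with made-up probabilities:

y_pred = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])  # predicted classes: 0, 1, 1
y_true = torch.tensor([0, 1, 0])
print(accuracy(y_pred, y_true))  # 2 of 3 correct -> 0.6666...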
5. The main Function
The main function ties the pieces above together: data preprocessing, model construction, training, saving, loading, and testing.
def main():
    train_dir = './aclImdb/train'  # raw training-set directory
    train_path = './train.txt'     # preprocessed training file
    test_dir = './aclImdb/test'    # raw test-set directory
    test_path = './test.txt'       # preprocessed test file
    vocab = data_process(train_path, train_dir)  # preprocess and build the vocabulary
    data_process(test_path, test_dir)  # also write test.txt; the returned vocab is discarded
    np.save('vocab.npy', vocab)  # save the vocabulary to disk
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # reload the stored vocab
    # build the Dataset instances
    train_data = MyDataset(text_path=train_path)
    test_data = MyDataset(text_path=test_path)
    # build the DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)
    # build the model
    model = LSTM(vocab=vocab, embed_size=300, num_hiddens=128, num_layers=2)
    train(model=model, train_data=train_loader, vocab=vocab, epoch=1)
    # load the trained parameters
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl', map_location=torch.device('cpu')))
    # evaluate on the test set
    acc = test(model=model, test_data=test_loader, vocab=vocab)
    print(acc)
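Once the vocabulary and parameters are saved, classifying a new review takes only a few lines. A sketch, with a made-up review string:

vocab = np.load('vocab.npy', allow_pickle=True).item()
model = LSTM(vocab=vocab, embed_size=300, num_hiddens=128, num_layers=2)
model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl', map_location=torch.device('cpu')))
model.eval()
with torch.no_grad():
    x = text_transform([clean_str("What a wonderful film, I'd watch it again!")], vocab)
    probs = model(x)[0]  # [p_negative, p_positive]
    print('positive' if probs[1] > probs[0] else 'negative')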
Complete Code
import torch  # torch==1.7.1
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_WORD = 10000  # keep only the 10000 most frequent words
MAX_LEN = 300     # pad/truncate every sentence to length 300
word_count = {}   # word -> occurrence count

# Clean text: normalize punctuation, collapse whitespace, lowercase
def clean_str(string):
    # keep letters, digits and the few punctuation marks that the rules below separate
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Tokenizer: plain whitespace split
def tokenizer(sentence):
    return sentence.split()

# Preprocessing: write one "label text" line per review and build the vocabulary
def data_process(text_path, text_dir):
    print("data preprocess")
    file_pro = open(text_path, 'w', encoding='utf-8')
    for root, s_dirs, _ in os.walk(text_dir):  # walk the sub-folders of train/ (or test/)
        for sub_dir in s_dirs:
            i_dir = os.path.join(root, sub_dir)
            text_list = os.listdir(i_dir)
            tag = os.path.split(i_dir)[-1]  # the folder name is the label
            if tag == 'pos':
                label = '1'
            elif tag == 'neg':
                label = '0'
            else:  # skip 'unsup' and any other folder
                continue
            for name in text_list:
                if not name.endswith('txt'):  # skip non-txt files
                    continue
                f = open(os.path.join(i_dir, name), 'r', encoding='utf-8')
                raw_line = f.readline()  # each review file is a single line
                pro_line = clean_str(raw_line)
                tokens = tokenizer(pro_line)  # tokenize and update the word counts
                for token in tokens:
                    if token in word_count:
                        word_count[token] += 1
                    else:
                        word_count[token] = 1  # a first occurrence counts as 1, not 0
                file_pro.write(label + ' ' + pro_line + '\n')
                f.close()
    file_pro.flush()
    file_pro.close()
    print("build vocabulary")
    vocab = {"<UNK>": 0, "<PAD>": 1}
    # sort words by frequency and keep only the MAX_WORD most frequent ones
    word_count_sort = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    word_number = 1
    for word in word_count_sort:
        if word[0] not in vocab:
            vocab[word[0]] = len(vocab)
            word_number += 1
        if word_number > MAX_WORD:
            break
    return vocab

# Dataset definition
class MyDataset(Dataset):
    def __init__(self, text_path):
        file = open(text_path, 'r', encoding='utf-8')
        self.text_with_tag = file.readlines()  # one "label text" line per sample
        file.close()

    def __getitem__(self, index):  # fetch one sample's label and text
        line = self.text_with_tag[index]
        label = int(line[0])  # the first character is the label
        text = line[2:-1]     # the rest, minus the trailing newline, is the review
        return text, label

    def __len__(self):
        return len(self.text_with_tag)

# Convert a batch of sentences into fixed-length (MAX_LEN) index tensors using vocab
def text_transform(sentence_list, vocab):
    sentence_index_list = []
    for sentence in sentence_list:
        # map each token to its index, falling back to <UNK> for out-of-vocabulary words
        sentence_idx = [vocab[token] if token in vocab else vocab['<UNK>'] for token in tokenizer(sentence)]
        if len(sentence_idx) < MAX_LEN:
            for i in range(MAX_LEN - len(sentence_idx)):  # pad short sentences with <PAD>
                sentence_idx.append(vocab['<PAD>'])
        sentence_idx = sentence_idx[:MAX_LEN]  # truncate long sentences to MAX_LEN
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)

# LSTM model definition
class LSTM(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch_size, seq_len); nn.LSTM expects the sequence
        # dimension first, so transpose before looking up the embeddings
        embeddings = self.embedding(inputs.permute(1, 0))
        # with no initial state passed in, outputs holds the top layer's hidden
        # state at every time step, shape (seq_len, batch_size, num_hiddens)
        outputs, _ = self.encoder(embeddings)
        # use the hidden state at the final time step as the sequence
        # representation, shape (batch_size, num_hiddens)
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # class probabilities [p_neg, p_pos]
        return outs

# Model training
def train(model, train_data, vocab, epoch=10):
    print('train model')
    model = model.to(device)
    # loss function and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)
    for ep in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # running loss
        avg_acc = 0   # running accuracy
        for idx, (text, label) in enumerate(tqdm(train_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            loss = criterion(pred.log(), train_y)  # NLLLoss expects log-probabilities
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # average loss and accuracy over the epoch
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)
        print("epoch:", ep + 1, "avg_loss:", avg_loss, "train_avg_acc:", avg_acc)
    # save the trained parameters
    torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')

# Model testing
def test(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    with torch.no_grad():  # no gradients needed at test time
        for idx, (text, label) in enumerate(tqdm(test_data)):
            test_x = text_transform(text, vocab).to(device)
            test_y = label.to(device)
            pred = model(test_x)
            avg_acc += accuracy(pred, test_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute batch accuracy (valid for binary 0/1 labels)
def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]  # predicted class indices
    # |pred - true| is 1 exactly when a 0/1 prediction is wrong, so the batch
    # size minus the sum of absolute differences is the number of correct predictions
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))
    return acc.detach().cpu().numpy() / len(y_pred)

def main():
    train_dir = './aclImdb/train'  # raw training-set directory
    train_path = './train.txt'     # preprocessed training file
    test_dir = './aclImdb/test'    # raw test-set directory
    test_path = './test.txt'       # preprocessed test file
    vocab = data_process(train_path, train_dir)  # preprocess and build the vocabulary
    data_process(test_path, test_dir)  # also write test.txt; the returned vocab is discarded
    np.save('vocab.npy', vocab)  # save the vocabulary to disk
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # reload the stored vocab
    # build the Dataset instances
    train_data = MyDataset(text_path=train_path)
    test_data = MyDataset(text_path=test_path)
    # build the DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)
    # build the model
    model = LSTM(vocab=vocab, embed_size=300, num_hiddens=128, num_layers=2)
    train(model=model, train_data=train_loader, vocab=vocab, epoch=1)
    # load the trained parameters
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl', map_location=torch.device('cpu')))
    # evaluate on the test set
    acc = test(model=model, test_data=test_loader, vocab=vocab)
    print(acc)

if __name__ == '__main__':
    main()