1. Overview
This article walks through binary sentiment classification of text with a bidirectional LSTM (BiLSTM). Compared with a unidirectional LSTM, a BiLSTM reads each sequence in both directions, so it captures context on both sides of every word and generally yields better classification accuracy. The implementation consists of the following steps:
1. Text preprocessing (tokenization, stopword removal, etc.)
2. Building the embedding layer
3. Building the BiLSTM model
4. Model training
2. Model Performance Comparison
3. Implementation Steps
(1) Loading the data
import numpy as np
import pandas as pd
data = pd.read_excel('./data/data_test_train.xlsx')  # forward slashes avoid backslash-escape issues on Windows paths
data.head()
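If you do not have the spreadsheet at hand, a tiny hand-made DataFrame with the same two columns (column names 'comment' and 'sentiment' are taken from the rest of the article; the rows below are made up) is enough to follow along:

# hypothetical stand-in for data_test_train.xlsx: a text column and a 0/1 label column
data = pd.DataFrame({
    'comment': ['这家店的服务非常好,下次还会再来', '质量太差了,完全不值这个价格'],
    'sentiment': [1, 0]
})
print(data.head())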
(2) Text preprocessing
# coding=utf-8
import jieba, re
# libraries needed for English text processing
import nltk
nltk.download('stopwords')  # download the stopword lists
nltk.download('punkt')      # download the tokenizer models required by word_tokenize
from nltk.tokenize import word_tokenize  # English tokenizer
from nltk.corpus import stopwords  # English stopword lists
import string  # used to strip all punctuation from English sentences
# --------------------------------------------------- (shared helpers for both Chinese and English text)
# Remove URLs from a raw string
def remove_urls(raw_sentence):
    # crude URL pattern: a scheme-like prefix followed by non-whitespace
    url_reg = r'[a-z]*[:.]+\S+'
    result = re.sub(url_reg, '', raw_sentence)
    return result
# Remove emoji characters from a raw string
def remove_emoji(raw_sentence):
    try:
        # wide Unicode build: match code points outside the BMP
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # narrow-build fallback: match surrogate pairs
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub('', raw_sentence)
# ------------------------------------------------------------------ (helpers for Chinese text)
# Build the stopword list
def stopwordslist(path):  # path to the stopword file
    stopwords = [line.strip() for line in open(path, encoding='GBK').readlines()]
    return stopwords
# Tokenize a Chinese document with jieba and drop stopwords
def seg_depart_Chiness(raw_sentence, path):
    sentence_depart = jieba.cut(raw_sentence.strip().replace(" ", ""))
    stopwords = stopwordslist(path)
    outstr_list = []
    for word in sentence_depart:
        if word not in stopwords:
            outstr_list.append(word)
    return outstr_list
# -------------------------------------------------------------------- (helpers for English text)
# Tokenize English text
def seg_depart_English(raw_sentence):
    # string.punctuation builds a translation table; '' means the matched characters are removed
    translator = str.maketrans('', '', string.punctuation)
    # strip all punctuation from the string
    raw_sentence = raw_sentence.translate(translator)
    # tokenize and remove stopwords
    sentence_depart = word_tokenize(raw_sentence)  # tokenize the English text
    stop_words = set(stopwords.words('english'))  # load the English stopword set
    outstr_list = [w.lower() for w in sentence_depart if not w.lower() in stop_words]
    return outstr_list
Call the helper functions above to clean the data; the English helpers are exercised in a short sanity check after this block.
data['comment'] = data['comment'].apply(remove_urls)
data['comment'] = data['comment'].apply(remove_emoji)
stop_words_file = '哈工大停用词表.txt'
data['comment'] = data['comment'].map(lambda x: seg_depart_Chiness(x, stop_words_file))
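seg_depart_English is defined above but never used on this (Chinese) dataset. A minimal sanity check on a made-up sentence, assuming the NLTK downloads above succeeded:

sample = "Check out https://example.com , the food was absolutely great!"
print(seg_depart_English(remove_urls(sample)))
# expected output along the lines of: ['check', 'food', 'absolutely', 'great']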
(3) Building the embedding layer
import torch
from gensim.models import KeyedVectors
class DataProcess:
    def __init__(self, sentences, sen_len, w2_path):
        self.sentences = sentences  # list of tokenized sentences
        self.sen_len = sen_len  # maximum sentence length
        self.w2_path = w2_path  # path to the pretrained word2vec file
        self.index2word = []  # index -> word lookup
        self.word2index = {}  # word -> index lookup
        self.embedding_matrix = []
        # load the pretrained word2vec vectors
        self.embedding = KeyedVectors.load_word2vec_format(self.w2_path, binary=False)
        self.embedding_dim = self.embedding.vector_size
    def make_embedding(self):
        # build word2index, index2word and the embedding matrix from the pretrained vectors
        for i, word in enumerate(self.embedding.key_to_index):
            if i % 1000 == 0:
                print(f'get word {i}')
            self.word2index[word] = len(self.word2index)
            self.index2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        # append '<PAD>' and '<UNK>' to the embedding
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        return self.embedding_matrix
    def add_embedding(self, word):
        # add a new word to the embedding with a randomly initialized vector
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)  # random initialization
        self.word2index[word] = len(self.word2index)
        self.index2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
    def sentence_word2idx(self):
        sentence_list = []
        for i, sentence in enumerate(self.sentences):
            sentence_index = []  # embedding indices for this sentence
            for word in sentence:
                if word in self.word2index.keys():
                    # the word is in the vocabulary: use its index directly
                    sentence_index.append(self.word2index[word])
                else:
                    # otherwise map it to <UNK>
                    sentence_index.append(self.word2index['<UNK>'])
            # normalize the sentence to a fixed length
            sentence_index = self.pad_sequence(sentence_index)
            sentence_list.append(sentence_index)
        return torch.LongTensor(sentence_list)
    def pad_sequence(self, sentence):
        # pad or truncate a single sentence to sen_len
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2index['<PAD>'])  # pad short sentences with the <PAD> index
        return sentence
    def labels2tensor(self, y):
        y = [int(label) for label in y]
        return torch.LongTensor(y)
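To see the class end to end without downloading the Tencent embeddings, you can write a toy word2vec text file and run the pipeline on it. The words and 4-dimensional vectors below are made up for illustration:

# hypothetical 3-word, 4-dimensional word2vec file in the standard text format
with open('toy_w2v.txt', 'w', encoding='utf-8') as f:
    f.write('3 4\n')
    f.write('好 0.1 0.2 0.3 0.4\n')
    f.write('差 0.5 0.6 0.7 0.8\n')
    f.write('服务 0.9 1.0 1.1 1.2\n')
toy_sentences = [['服务', '好'], ['质量', '差']]  # '质量' is out-of-vocabulary -> <UNK>
proc = DataProcess(toy_sentences, sen_len=4, w2_path='toy_w2v.txt')
emb = proc.make_embedding()
print(emb.shape)                 # torch.Size([5, 4]): 3 words + <PAD> + <UNK>
print(proc.sentence_word2idx())  # (2, 4) index matrix, short rows padded with the <PAD> index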
(4) Building the BiLSTM model
import torch
import torch.nn.functional as F
class BiLSTMModel(torch.nn.Module):
    def __init__(self, embedding_matrix, embedding_dim, hidden_dim, num_layers, dropout, requires_grad=True):
        super(BiLSTMModel, self).__init__()
        # load the prepared embedding_matrix into a torch.nn.Embedding layer
        self.embedding = torch.nn.Embedding(embedding_matrix.size(0), embedding_matrix.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding_matrix, requires_grad=requires_grad)
        self.LSTM = torch.nn.LSTM(embedding_dim, hidden_dim,
                                  num_layers=num_layers,
                                  bidirectional=True,  # enable the bidirectional LSTM
                                  batch_first=True)
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_dim * 2, hidden_dim),
            torch.nn.Linear(hidden_dim, 1),
            torch.nn.Sigmoid()
        )
    def forward(self, inputs):
        inputs = self.embedding(inputs)  # (batch, sen_len) -> (batch, sen_len, embedding_dim)
        output, (h_n, c_n) = self.LSTM(inputs)
        # h_n has shape (num_layers * 2, batch, hidden_dim); its last two entries are the
        # final forward and backward hidden states of the top layer, concatenated here
        out = torch.cat([h_n[-1, :, :], h_n[-2, :, :]], dim=-1)
        out = self.classifier(out)
        return out
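A quick shape check with a random embedding matrix (all sizes below are arbitrary) confirms the forward pass produces one probability per sentence:

fake_emb = torch.randn(10, 100)    # hypothetical 10-word vocabulary, dim 100
m = BiLSTMModel(fake_emb, embedding_dim=100, hidden_dim=128, num_layers=1, dropout=0.2)
x = torch.randint(0, 10, (4, 20))  # batch of 4 sentences, 20 tokens each
print(m(x).shape)                  # torch.Size([4, 1]), values in (0, 1)

Note that the two stacked Linear layers have no activation between them, so together they act as a single linear map; inserting a ReLU between them is a common variant.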
(5) Wrapping the dataset
# wrap the tensors in a Dataset
from torch.utils import data
class TwitterDataser(data.Dataset):
    def __init__(self, x, y):
        self.data = x
        self.label = y
    def __getitem__(self, index):
        if self.label is None:
            return self.data[index]
        return self.data[index], self.label[index]
    def __len__(self):
        return len(self.data)
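Passing y=None turns the same class into a test-time dataset that yields only features; with labels it yields (x, y) pairs, for example:

ds = TwitterDataser(torch.zeros(6, 4, dtype=torch.long), torch.ones(6))
x0, y0 = ds[0]
print(len(ds), x0.shape, y0)  # 6 torch.Size([4]) tensor(1.)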
(6) Defining the evaluation helpers
# evaluation helpers
from sklearn.metrics import roc_auc_score, accuracy_score
class evaluate:
    def __init__(self, y_true, y_pre):
        # detach and move to CPU so the sklearn metrics also work with GPU tensors
        self.y_true = y_true.detach().flatten().cpu()
        self.y_pre = y_pre.detach().flatten().cpu().clone()  # clone before thresholding in place
        # binarize the predicted probabilities at 0.5
        self.y_pre[self.y_pre >= 0.5] = 1
        self.y_pre[self.y_pre < 0.5] = 0
    def evaluation(self):
        # number of correct predictions in the batch
        correct = torch.sum(torch.eq(self.y_pre, self.y_true)).item()
        return correct
    def acc(self):
        return accuracy_score(self.y_true, self.y_pre)
    def auc(self):
        # note: AUC on binarized predictions is coarse; pass raw probabilities for a proper ROC AUC
        return roc_auc_score(self.y_true, self.y_pre)
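A small check with hand-made tensors (values chosen arbitrarily):

y_true = torch.tensor([1., 0., 1., 0.])
y_prob = torch.tensor([0.9, 0.2, 0.4, 0.7])  # hypothetical model outputs
e = evaluate(y_true, y_prob)
print(e.evaluation(), e.acc())  # 2 correct out of 4 -> 2 0.5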
(7) Building the training and validation functions
import torch
from torch import nn
import torch.optim as optim
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def train(train_loader, model, criterion, optimizer, epoch):
    # put the model in train mode so dropout is active and the optimizer updates its parameters
    model.train()
    train_len = len(train_loader)
    total_loss, total_acc = 0, 0
    for i, (inputs, labels) in enumerate(train_loader):
        # move the batch to the GPU if one is available
        inputs = inputs.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.float)  # BCELoss expects float labels
        # 1. zero the gradients
        optimizer.zero_grad()
        # 2. compute the outputs
        outputs = model(inputs)
        outputs = outputs.squeeze(-1)  # (batch, 1) -> (batch,); squeeze(-1) stays safe when the batch size is 1
        # 3. compute the loss
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        # 4. compute predictions and batch accuracy (batch_size is the global defined in section (8))
        correct = evaluate(labels, outputs).evaluation()
        total_acc += (correct / batch_size)
        # 5. backpropagate
        loss.backward()
        # 6. update the parameters
        optimizer.step()
        print("[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f}".
              format(epoch + 1, i + 1, train_len, loss.item(), correct * 100 / batch_size), end='\r')
    print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss / train_len, total_acc / train_len * 100))
def validate(val_loader, model, criterion):
    model.eval()  # eval mode: dropout off, model parameters fixed for this pass
    val_len = len(val_loader)
    with torch.no_grad():
        total_loss, total_acc = 0, 0
        for i, (inputs, labels) in enumerate(val_loader):
            # 1. move the batch to the GPU if one is available
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            # 2. compute the outputs
            outputs = model(inputs)
            outputs = outputs.squeeze(-1)
            # 3. compute the loss
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            # 4. compute predictions and batch accuracy
            correct = evaluate(labels, outputs).evaluation()
            total_acc += (correct / batch_size)
        print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss / val_len, total_acc / val_len * 100))
        print('-----------------------------------------------')
    return total_acc / val_len * 100
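One design note: because the classifier already ends in Sigmoid, the article pairs it below with nn.BCELoss. A numerically more stable and widely used alternative is to drop the Sigmoid from the model and train with nn.BCEWithLogitsLoss instead (a sketch of the variant, not what this article's code does):

# alternative: remove torch.nn.Sigmoid() from the classifier, then
criterion = nn.BCEWithLogitsLoss()  # fuses sigmoid + BCE in one numerically stable op
# at inference time, recover probabilities with torch.sigmoid(model(inputs))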
(8) Main function: calling the pieces above to train and save the model
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
model_dir = './model'
w2v_path = './model/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt'
# whether the embedding layer is fine-tuned during training
requires_grad = False
sen_len = 100
batch_size = 32
epochs = 10
lr = 0.001
def main():
    # load the data and remove URLs, emoji and stopwords
    data = pd.read_excel('./data/data_test_train.xlsx')
    data['comment'] = data['comment'].apply(remove_urls)
    data['comment'] = data['comment'].apply(remove_emoji)
    stop_words_file = '哈工大停用词表.txt'
    data['comment'] = data['comment'].map(lambda x: seg_depart_Chiness(x, stop_words_file))
    data_x, data_y = data['comment'], data['sentiment']
    # preprocessing: build the embedding and convert sentences to index tensors
    preprocess = DataProcess(data_x, sen_len, w2_path=w2v_path)
    embedding = preprocess.make_embedding()
    data_x = preprocess.sentence_word2idx()
    data_y = preprocess.labels2tensor(data_y)
    # split into training and validation sets
    x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=5)
    # build the Datasets
    train_dataset = TwitterDataser(x_train, y_train)
    val_dataset = TwitterDataser(x_test, y_test)
    # build the batch loaders; shuffle the training data each epoch
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
    # BiLSTM
    model = BiLSTMModel(
        embedding,
        embedding_dim=100,
        hidden_dim=128,
        num_layers=1,
        dropout=0.2,
        requires_grad=requires_grad
    ).to(device)
    # total and trainable parameter counts
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    # standard binary-classification loss
    criterion = nn.BCELoss()
    # Adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_acc = 0
    for epoch in range(epochs):
        train(train_loader, model, criterion, optimizer, epoch)
        total_acc = validate(val_loader, model, criterion)
        if total_acc > best_acc:
            # save the model whenever validation accuracy beats the best so far
            best_acc = total_acc
            torch.save(model, '{}/ckpt.model'.format(model_dir))
            print('saving model with acc {:.3f}'.format(total_acc))
if __name__ == '__main__':
    main()
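After training, the saved checkpoint can be reloaded for inference. A minimal sketch, assuming a DataProcess instance built the same way as in main() is passed in, and reusing the same preprocessing chain (the predict helper is hypothetical, not part of the article's code):

def predict(texts, preprocess, model_path='./model/ckpt.model'):
    # texts: list of raw comment strings
    model = torch.load(model_path, map_location=device)
    model.eval()
    cleaned = [seg_depart_Chiness(remove_emoji(remove_urls(t)), '哈工大停用词表.txt') for t in texts]
    preprocess.sentences = cleaned          # reuse the fitted vocabulary and sen_len
    x = preprocess.sentence_word2idx().to(device)
    with torch.no_grad():
        probs = model(x).squeeze(-1)
    return (probs >= 0.5).long().tolist()   # 1 = positive, 0 = negative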