基于BiLSTM的酒店评论文本情感分析

最新推荐文章于 2024-04-28 20:07:07 发布

人工智能（篮球方向）

最新推荐文章于 2024-04-28 20:07:07 发布

阅读量3.9k

点赞数 9

文章标签： NLP 深度学习 Python 文本情感分析

本文链接：https://blog.csdn.net/weixin_42386003/article/details/107266257

版权

介绍：（需要有部分机器学习和深度学习，还有NLP的理论知识）

使用深度学习框架TensorFlow中的keras接口实现BiLstm神经网络，用训练集训练出一个模型，可以判断酒店评论文本的正负向情绪。
数据集使用了8886条酒店评论，其中正向评论有4443条，负向评论有4443条。正负向评论都打好了标签，正向标1，负向标0。
该模型的embedding用了word2vec预训练模型，是中科大的一个开源的词向量模型。
模型的准确率达到97%。
博主是在 Jupyter Notebook 上执行的。

源码以及操作：（配置好环境和设置好路径可以直接运行）

#导包：

# 首先加载必用的库
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import re
import jieba # 结巴分词
# gensim用来加载预训练word vector
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")
# 用来解压
import bz2
import csv
from tensorflow.python.keras import metrics
import tensorflow as tf

#导入预训练模型，需先下载预训练模型放入指定目录，导入时需要等待几分钟到十几分钟不等。

# Decompress the bz2-packed word vectors to a plain file next to the archive.
# The archive is streamed in 100 KiB chunks so it never has to fit in memory.
with open("embeddings/sgns.zhihu.bigram", 'wb') as new_file, \
        open("embeddings/sgns.zhihu.bigram.bz2", 'rb') as file:
    decompressor = bz2.BZ2Decompressor()
    # iter(callable, sentinel) keeps reading until read() returns b'' (EOF)
    for data in iter(lambda: file.read(100 * 1024), b''):
        new_file.write(decompressor.decompress(data))

# Load the decompressed vectors (word2vec text format); bytes that cannot be
# decoded are ignored instead of aborting the load.
cn_model = KeyedVectors.load_word2vec_format('embeddings/sgns.zhihu.bigram',
                                             binary=False,
                                             unicode_errors="ignore")

#读入数据集，分好训练样本和训练标签

import ast


def _read_labeled_samples(path):
    """Parse a file holding one dict literal per line into (texts, labels).

    Each non-blank line must be a Python literal dict with the keys
    "text" and "label". ``ast.literal_eval`` is used instead of ``eval`` so
    that file content can never execute arbitrary code.
    """
    texts = []
    labels = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # tolerate blank / trailing lines instead of crashing on them
                continue
            dic = ast.literal_eval(line)
            texts.append(dic["text"])
            labels.append(dic["label"])
    return texts, labels


# Raw review texts
train_texts_orig = []
# Label of each text, kept index-aligned with train_texts_orig
train_target = []

texts, labels = _read_labeled_samples("negative_samples.txt")
train_texts_orig.extend(texts)
train_target.extend(labels)

# Each line of this corpus is read as "<label digit><review text>" once
# punctuation and whitespace have been stripped.
filename = 'ChnSentiCorp_htl_all.txt'
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        line = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "", line)
        if not line:
            # nothing left after stripping (blank line): no label to read
            continue
        train_target.append(int(line[0]))
        # Drop only the leading label character. str.replace(line[0], '')
        # would also delete every later occurrence of that digit in the text.
        train_texts_orig.append(line[1:])

texts, labels = _read_labeled_samples("positive_samples.txt")
train_texts_orig.extend(texts)
train_target.extend(labels)

#使用TensorFlow接口

# 我们使用tensorflow的keras接口来建立模型
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
import tensorflow as tf

#进行分词和tokenize
#train_tokens是一个长长的list，其中含有8886个小list，对应每一条评价
# Punctuation / whitespace stripped from every review before segmentation.
# Compiled once because it is applied to every review in the corpus.
punctuation = re.compile(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")

# train_tokens holds one list of vocabulary indices per review
train_tokens = []
for text in train_texts_orig:
    text = punctuation.sub("", text)
    cut_list = []
    # jieba.cut yields the segmented words one at a time
    for word in jieba.cut(text):
        try:
            # map the word to its index in the pre-trained vocabulary
            # NOTE(review): `.vocab` is the gensim 3.x API - confirm the
            # installed gensim version before upgrading.
            cut_list.append(cn_model.vocab[word].index)
        except KeyError:
            # words missing from the vocabulary are encoded as index 0
            cut_list.append(0)
    train_tokens.append(cut_list)
# 进行分词和索引化

# Token count of every review, as a NumPy vector
num_tokens = np.array([len(review) for review in train_tokens])

# Length of the longest review, in tokens
num_tokens.max()

# Mean review length, in tokens
num_tokens.mean()

# Sequence length used for padding/truncation: mean plus two standard
# deviations. If the lengths were normally distributed this would cover
# roughly 95% of the reviews.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

# Fraction of reviews strictly shorter than max_tokens; shorter reviews get
# padded later, longer ones get truncated.
(num_tokens < max_tokens).sum() / len(num_tokens)
#for i in train_tokens[0]:
#print(i)

#定义由标签返回原文本的函数，方便Debug

def reverse_tokens(tokens):
    """Rebuild readable text from a sequence of vocabulary indices (debug aid).

    Args:
        tokens: iterable of integer indices into ``cn_model.index2word``.

    Returns:
        The words concatenated in order. Index 0 is rendered as a single
        space instead of being looked up.
    """
    # join once instead of growing a string inside the loop
    return ''.join(
        cn_model.index2word[i] if i != 0 else ' '
        for i in tokens
    )

#测试函数

# Sanity check: decode the first review back to text and show it next to
# its index sequence.
first_review_tokens = train_tokens[0]
reverse = reverse_tokens(first_review_tokens)

print(reverse)
print(first_review_tokens)

# Bare expression so the notebook displays the decoded string
reverse

# Every word maps to one fixed-length vector; read that length from a sample
# word (the original notes give 300 for this model).
embedding_dim = cn_model['山东大学'].shape[0]
print('词向量的长度为{}'.format(embedding_dim))
cn_model['山东大学']

# Only the first 50000 vocabulary entries are kept
num_words = 50000

# embedding_matrix has shape [num_words, embedding_dim]; row i holds the
# vector of the word with vocabulary index i. It is handed to the Keras
# Embedding layer as its initial weights.
embedding_matrix = np.zeros((num_words, embedding_dim))
for i in range(num_words):
    embedding_matrix[i, :] = cn_model[cn_model.index2word[i]]

embedding_matrix = embedding_matrix.astype('float32')

from sklearn.model_selection import train_test_split

# Bring every review to exactly max_tokens indices: short reviews are padded
# at the front, long reviews lose their leading tokens.
train_pad = pad_sequences(
    train_tokens,
    maxlen=max_tokens,
    padding='pre',
    truncating='pre',
)

# Indices beyond the embedding matrix are replaced by 0
out_of_vocab = train_pad >= num_words
train_pad[out_of_vocab] = 0

# Labels as a NumPy vector, index-aligned with the rows of train_pad
train_target = np.array(train_target)
print(train_target)

# Hold out 20% of the samples for testing; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_pad,
    train_target,
    test_size=0.2,
    random_state=12,
)
# Binary sentiment classifier: frozen pre-trained embeddings, a
# bidirectional LSTM that returns the full sequence, a second LSTM that
# returns only its last output, and a single sigmoid output unit.
embedding_layer = Embedding(
    num_words,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False,  # keep the pre-trained vectors fixed during training
)
bilstm_layer = Bidirectional(
    LSTM(units=64, return_sequences=True, dropout=0.2)
)
lstm_layer = LSTM(units=16, return_sequences=False)
output_layer = Dense(1, activation="sigmoid")

model = Sequential([embedding_layer, bilstm_layer, lstm_layer, output_layer])

model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

# Print the network structure
model.summary()

# Checkpoint: after an epoch with the best val_loss so far, save the model
# weights (weights only, not the full model) to path_checkpoint.
path_checkpoint = 'sentiment_checkpoint_demo.keras'
checkpoint = ModelCheckpoint(filepath=path_checkpoint, monitor='val_loss',
                             verbose=1, save_weights_only=True,
                             save_best_only=True)

# Best effort: resume from previously saved weights when they can be loaded.
# Any failure is printed and training simply starts from the freshly
# initialised model.
try:
    model.load_weights(path_checkpoint)
except Exception as e:
    print(e)

# Stop training when val_loss has not improved for 5 epochs
earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# Scale the learning rate by 0.1 as soon as val_loss stops improving
# (patience=0), never going below 1e-8.
lr_reduction = ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.1, min_lr=1e-8, patience=0,
                                 verbose=1)

# Callbacks handed to fit()
callbacks = [
    earlystopping,
    checkpoint,
    lr_reduction
]

# Train; 20% of the training split is used for validation.
# NOTE: with epochs=2 the early-stopping patience of 5 can never be reached.
model.fit(X_train, y_train,
          validation_split=0.2,
          epochs=2,
          batch_size=256,
          callbacks=callbacks)

#评估模型误差和准确率

# Evaluate on the held-out test split. result[0] is the loss and result[1]
# the single metric the model was compiled with (binary accuracy).
result = model.evaluate(X_test, y_test)
accuracy = result[1]

print('准确率:', '%.2f' % accuracy)

#使用模型，模拟实际使用场景，预测自定义的酒店评论文本

def predict_sentiment(text):
    """Print the sentiment the trained model predicts for one review.

    The text goes through the same pipeline as the training data:
    punctuation removal, jieba segmentation, vocabulary-index lookup and
    pre-padding/truncation to ``max_tokens``.

    Args:
        text: raw review text.

    Returns:
        None. The input text and the verdict (with the raw sigmoid output)
        are printed.
    """
    print(text)
    # strip punctuation and whitespace
    text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "", text)
    # segment, then map each word to its vocabulary index
    cut_list = []
    for word in jieba.cut(text):
        try:
            index = cn_model.vocab[word].index
        except KeyError:
            # word not in the pre-trained vocabulary
            index = 0
        # Indices outside the embedding matrix also map to 0. num_words is
        # used instead of a literal so this stays in sync with the matrix.
        cut_list.append(index if index < num_words else 0)
    # pad / truncate to the length the model was trained on
    tokens_pad = pad_sequences([cut_list], maxlen=max_tokens,
                               padding='pre', truncating='pre')
    # predict; one sample in, so the score is the single value at [0][0]
    result = model.predict(x=tokens_pad)
    coef = result[0][0]
    # sigmoid output: 0.5 is the decision threshold
    if coef >= 0.5:
        print('是一例正面评价', 'output=%.2f' % coef)
    else:
        print('是一例负面评价', 'output=%.2f' % coef)

# Hand-written reviews used to try the trained model on unseen text
test_list = [
    '酒店设施太旧，而且前台态度也太差了',
    '酒店的床铺很软，很干净',
    '酒店的卫生不好，有蟑螂',
    '房间空调坏了，热死了',
    '房间窗户很大，通风好',
    '酒店周围环境不好，周围好吵闹',
    '房间的隔音不好，早上醒来还漏水',
    '酒店的采光不好',
    '酒店没有标明价格，临时加价，差评！！！',
    '酒店怎么评价呢，中等吧',
    '酒店有些方面很好，有些方面很差',
    '房间有监控，可怕',
    '酒店还行吧，一般',
    '酒店房间很大，采光很好',
    '这算是酒店吗？房间不像房间，连个像样的洗手间都没有，什么垃圾玩意？就这？这价格也是很贵，不说了，这种酒店也能开？',
    '6666',
]

# Print the predicted sentiment of every sample review
for text in test_list:
    predict_sentiment(text)

预测结果如图：
另外附上模型准确率图：

PS:需要数据集和预训练模型的请留下邮箱。

人工智能（篮球方向）

关注

9
点赞
踩
63

收藏

觉得还不错? 一键收藏
85
评论
基于BiLSTM的酒店评论文本情感分析

介绍：（需要有部分机器学习和深度学习，还有NLP的理论知识）使用深度学习框架TensorFlow中的keras接口实现BiLstm神经网络，用训练集训练出一个模型，可以判断酒店评论文本的正负向情绪。数据集使用了8886条酒店评论，其中正向评论有4443条，负向评论有4443条。正负向评论都打好了标签，正向标1，负向标0。该模型的embedding用了word2vec预训练模型，是中科大的一个开源的词向量模型。模型的准确率达到97%。博主是在juypter notebook上执行的。源码
复制链接

扫一扫