介绍:(需要有部分机器学习和深度学习,还有NLP的理论知识)
- 使用深度学习框架 TensorFlow 的 Keras 接口实现 BiLSTM 神经网络,用训练集训练出一个模型,用于判断酒店评论文本的正负向情绪。
- 数据集使用了8886条酒店评论,其中正向评论有4443条,负向评论有4443条。正负向评论都打好了标签,正向标1,负向标0。
- 该模型的embedding用了word2vec预训练模型,是中科大的一个开源的词向量模型。
- 模型的准确率达到97%。
- 博主是在 Jupyter Notebook 上执行的。
源码以及操作:(配置好环境和设置好路径可以直接运行)
#导包:
# 首先加载必用的库
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import re
import jieba # 结巴分词
# gensim用来加载预训练word vector
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")
# 用来解压
import bz2
import csv
from tensorflow.python.keras import metrics
import tensorflow as tf
# Load the pre-trained word vectors.
# The compressed model must already have been downloaded into embeddings/.
# It is first unpacked next to the archive, then parsed by gensim; per the
# original author, loading can take from a few minutes to over ten minutes.
with open("embeddings/sgns.zhihu.bigram", 'wb') as new_file:
    with open("embeddings/sgns.zhihu.bigram.bz2", 'rb') as file:
        decompressor = bz2.BZ2Decompressor()
        # Stream the archive in 100 KiB chunks so it never has to fit in memory.
        while True:
            data = file.read(100 * 1024)
            if data == b'':
                break
            new_file.write(decompressor.decompress(data))

# The unpacked file is in the textual word2vec format (binary=False).
cn_model = KeyedVectors.load_word2vec_format(
    'embeddings/sgns.zhihu.bigram',
    binary=False,
    unicode_errors="ignore",
)
# Read the dataset into two parallel lists: review texts and their labels.
import ast  # only this loading step needs it (parses the dict-literal files)

# Whitespace plus ASCII/CJK punctuation that is stripped from each corpus line.
_CORPUS_PUNCT_RE = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
)


def _append_dict_samples(path, texts, labels):
    """Append the samples stored in *path* to *texts* and *labels*.

    Every non-blank line of the file must be a Python dict literal that has
    a "text" key and a "label" key.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
        KeyError: if a record lacks "text" or "label".
    """
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # NOTE: the original used eval() here, which executes arbitrary
            # code found in the data file. literal_eval only accepts literals.
            record = ast.literal_eval(line)
            texts.append(record["text"])
            labels.append(record["label"])


# Original review texts.
train_texts_orig = []
# Label of each text, in the same order as train_texts_orig.
train_target = []

_append_dict_samples("negative_samples.txt", train_texts_orig, train_target)

filename = 'ChnSentiCorp_htl_all.txt'
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        # After stripping punctuation/whitespace the first character is the
        # label digit and the remainder is the review text.
        line = _CORPUS_PUNCT_RE.sub("", line)
        if not line:
            continue
        train_target.append(int(line[0]))
        # Drop only the leading label character. The original
        # line.replace(line[0], '') also deleted every later occurrence of
        # that digit inside the review itself.
        train_texts_orig.append(line[1:])

_append_dict_samples("positive_samples.txt", train_texts_orig, train_target)
#使用TensorFlow接口
# 我们使用tensorflow的keras接口来建立模型
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
import tensorflow as tf
# Segment every review with jieba and map each word to its vocabulary index.
# train_tokens becomes a list holding one list of integer indices per review.

# Whitespace plus ASCII/CJK punctuation removed before segmentation.
# Raw string: the pattern contains backslash escapes meant for the regex
# engine, not for Python. Compiled once instead of once per review.
_TOKEN_PUNCT_RE = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
)

train_tokens = []
for text in train_texts_orig:
    # Strip punctuation.
    text = _TOKEN_PUNCT_RE.sub("", text)
    cut_list = []
    # jieba.cut yields the segmented words lazily.
    for word in jieba.cut(text):
        try:
            # NOTE(review): `.vocab[word].index` looks like the gensim < 4
            # KeyedVectors API - confirm the installed gensim version.
            cut_list.append(cn_model.vocab[word].index)
        except KeyError:
            # Out-of-vocabulary words are encoded as 0. Index 0 is also a
            # regular vocabulary index, so the two are indistinguishable.
            cut_list.append(0)
    train_tokens.append(cut_list)
# Token-count statistics, used to choose one common sequence length.
# The bare expressions below only display their value in a notebook cell.
num_tokens = np.array([len(tokens) for tokens in train_tokens])
# Length of the longest review, in tokens.
np.max(num_tokens)
# Average review length, in tokens.
np.mean(num_tokens)
# Use mean + 2 * std as the cut-off: if lengths were normally distributed
# this would cover roughly 95% of the samples.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens
# Fraction of reviews shorter than max_tokens (the original author reported
# a cut-off of 236 covering about 95% of the samples). Shorter reviews get
# padded later, longer ones truncated.
np.sum(num_tokens < max_tokens) / len(num_tokens)
# Debug aid: uncomment to dump the indices of the first review.
# for i in train_tokens[0]:
#     print(i)
# Helper that maps token indices back to text; handy for debugging.
def reverse_tokens(tokens):
    """Return the text spelled by *tokens*.

    Each non-zero index is replaced by the corresponding word from
    ``cn_model.index2word``; every 0 becomes a single space.
    """
    pieces = [' ' if i == 0 else cn_model.index2word[i] for i in tokens]
    return ''.join(pieces)
# Sanity check: decode the first review and show it next to its raw indices.
_first_review_tokens = train_tokens[0]
reverse = reverse_tokens(_first_review_tokens)
print(reverse)
print(_first_review_tokens)
# Bare expression: displayed as the cell output in a notebook.
reverse
# Every word maps to a fixed-length vector; read that length off one sample
# word (the original author reports 300 for this model).
_sample_vector = cn_model['山东大学']
embedding_dim = _sample_vector.shape[0]
print(f'词向量的长度为{embedding_dim}')
_sample_vector

# Only the first 50000 vocabulary entries are used.
num_words = 50000
# embedding_matrix has shape [num_words, embedding_dim]; it is handed to the
# Keras Embedding layer later on.
embedding_matrix = np.zeros(shape=(num_words, embedding_dim))
for row in range(num_words):
    word = cn_model.index2word[row]
    embedding_matrix[row] = cn_model[word]
embedding_matrix = embedding_matrix.astype('float32')
# Bring every review to exactly max_tokens indices: shorter ones are padded
# at the front, longer ones lose their leading tokens. The result is a
# numpy array.
train_pad = pad_sequences(
    train_tokens,
    maxlen=max_tokens,
    padding='pre',
    truncating='pre',
)
# Indices outside the embedding matrix (>= num_words) are replaced by 0.
_out_of_range = train_pad >= num_words
train_pad[_out_of_range] = 0

# Label vector, in the same order as the texts were loaded.
train_target = np.array(train_target)
print(train_target)

# Split into training and test sets: 80% for training, 20% for testing.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_pad,
    train_target,
    test_size=0.2,
    random_state=12,  # fixed seed so the split is reproducible
)
# Classifier: frozen pre-trained embedding -> bidirectional LSTM -> LSTM ->
# single sigmoid unit.
model = Sequential([
    # The embedding layer is initialised from embedding_matrix and frozen
    # (trainable=False), so the pre-trained vectors are not updated.
    Embedding(
        num_words,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    ),
    # return_sequences=True so the next LSTM receives the whole sequence.
    Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2)),
    LSTM(units=16, return_sequences=False),
    Dense(1, activation="sigmoid"),
])
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)
# Show the network structure.
model.summary()
# Checkpoint callback: writes the weights to disk whenever val_loss improves.
path_checkpoint = 'sentiment_checkpoint_demo.keras'
checkpoint = ModelCheckpoint(
    filepath=path_checkpoint,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
# Best-effort resume: try to load previously saved weights. Any failure
# (for example, no checkpoint file yet) is only printed, and training then
# starts from the freshly initialised model.
try:
    model.load_weights(path_checkpoint)
except Exception as load_error:
    print(load_error)
# Early stopping: stop training when val_loss has not improved for 5 epochs.
earlystopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
)
# Learning-rate schedule: with patience=0, any epoch without a val_loss
# improvement multiplies the learning rate by 0.1, down to at most 1e-8.
lr_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=0,
    min_lr=1e-8,
    verbose=1,
)
# Callbacks passed to model.fit.
callbacks = [earlystopping, checkpoint, lr_reduction]
# Train. 20% of the training data is held out for validation, which is what
# the val_loss-based callbacks monitor.
model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=2,
    validation_split=0.2,
    callbacks=callbacks,
)
# Evaluate on the held-out test set. result[0] is the loss and result[1] the
# metric configured at compile time (binary accuracy).
result = model.evaluate(X_test, y_test)
print('准确率:', format(result[1], '.2f'))
# Use the trained model on free-form hotel reviews, as in real usage.

# Whitespace plus ASCII/CJK punctuation removed before segmentation.
# Raw string: the backslash escapes are meant for the regex engine.
# Compiled once rather than on every call.
_PREDICT_PUNCT_RE = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
)


def predict_sentiment(text):
    """Print *text* followed by the model's sentiment verdict for it.

    The text goes through the same steps as the training data: punctuation
    stripping, jieba segmentation, vocabulary lookup and padding/truncation
    to ``max_tokens``.

    Args:
        text: the review to classify.

    Returns:
        None. The verdict and the raw sigmoid output are printed.
    """
    print(text)
    # Strip punctuation.
    text = _PREDICT_PUNCT_RE.sub("", text)
    # Segment and convert words to vocabulary indices.
    cut_list = []
    for word in jieba.cut(text):
        try:
            index = cn_model.vocab[word].index
        except KeyError:
            # Unknown word.
            index = 0
        # Indices outside the embedding matrix are mapped to 0. Use
        # num_words rather than a hard-coded 50000 so this stays in sync
        # with the embedding matrix size.
        cut_list.append(index if index < num_words else 0)
    # Pad/truncate exactly like the training data.
    tokens_pad = pad_sequences([cut_list], maxlen=max_tokens,
                               padding='pre', truncating='pre')
    # Predict: one sample in, one sigmoid output out.
    result = model.predict(x=tokens_pad)
    coef = result[0][0]
    # The output layer is a sigmoid, so 0.5 is the decision threshold.
    if coef >= 0.5:
        print('是一例正面评价','output=%.2f'%coef)
    else:
        print('是一例负面评价','output=%.2f'%coef)
# Hand-written sample reviews used to try out the trained model.
test_list = [
    '酒店设施太旧,而且前台态度也太差了',
    '酒店的床铺很软,很干净',
    '酒店的卫生不好,有蟑螂',
    '房间空调坏了,热死了',
    '房间窗户很大,通风好',
    '酒店周围环境不好,周围好吵闹',
    '房间的隔音不好,早上醒来还漏水',
    '酒店的采光不好',
    '酒店没有标明价格,临时加价,差评!!!',
    '酒店怎么评价呢,中等吧',
    '酒店有些方面很好,有些方面很差',
    '房间有监控,可怕',
    '酒店还行吧,一般',
    '酒店房间很大,采光很好',
    '这算是酒店吗?房间不像房间,连个像样的洗手间都没有,什么垃圾玩意?就这?这价格也是很贵,不说了,这种酒店也能开?',
    '6666',
]
# Classify each sample in turn; predict_sentiment prints its own output.
for text in test_list:
    predict_sentiment(text)
- 预测结果如图:
- 另外附上模型准确率图: