用 RNN 模型实现文本情感分析。下面给出完整代码,共三个文件:Flask 服务入口、predict_rnn.py 推理封装和 rnn_model.py 模型定义。
# encoding:utf-8
import json
import logging
import os
import time
from logging.handlers import TimedRotatingFileHandler

from flask import Flask, request
from meinheld import server

from predict_rnn import RnnModel
app = Flask(__name__)
app = Flask(__name__)
def setLog():
log_fmt = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
formatter = logging.Formatter(log_fmt)
fh = TimedRotatingFileHandler(
filename="log/run_textPredict_server" + str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) + ".log",
when="H", interval=1,
backupCount=72)
fh.setFormatter(formatter)
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()
log.addHandler(fh)
setLog()
rnn_model = RnnModel()
@app.route('/ai/v1/TextEmotionAnalyse', methods=['POST'])
def textRNNClassPredict():
try:
start_time = time.time()
resParm = request.data
# 转字符串
resParm = str(resParm, encoding="utf-8")
resParm = eval(resParm)
requestId = resParm.get('requestId')
# 服务鉴权
token = resParm.get('token')
if not token:
res = {'code': 3, 'msg': 'token fail'}
logging.error("code: 3 msg: token fail ")
return json.dumps(res)
# req_json = request.get_json(silent=False)
# content = req_json.get('content')
strContent = resParm.get('inputStr')
if not strContent:
res = {'code': 4, 'msg': ' input string param invalid'}
logging.error("code: 4 msg: input string param invalid")
return json.dumps(res)
# 判定字符串长度
if len(strContent) > 600:
res = {'code': 5, 'msg': 'input string param length invalid'}
logging.error("code: 5 msg: input string param length invalid")
return json.dumps(res)
time_predict = time.time()
resFlag = rnn_model.predict(strContent)
logging.info(f"text analyse predict cost Time is: {str(time.time() - time_predict)} ")
if resFlag is None:
res = {'code': 2, 'msg': 'text analyse except, fail'}
logging.error("code: 2 msg: text analyse except, analyse fail")
return json.dumps(res)
print("\n\nresFlag: " + str(resFlag))
resEmotion = 0
positive_prob = resFlag[0][0]
negative_prob = resFlag[0][1]
if positive_prob >= 0.53: # 正向概率大于等于0.52 1 , 0, -1
resEmotion = 1
logging.info(f"情感为正向,概率为: {str(positive_prob)}")
if positive_prob <= 0.48: # 正向概率小于等于0.48
resEmotion = -1
logging.info(f"情感为负向,概率为: {str(negative_prob)}")
elif positive_prob > 0.48 and positive_prob < 0.53: # 正向概率大于0.48小于0.52
resEmotion = 0
logging.info(f"情感为中性,概率为: {str(max(negative_prob, positive_prob))}")
timeUsed = time.time() - start_time
data = {'requestId': requestId, 'emotionAnalyseRes': resEmotion, 'timeUsed': timeUsed}
res = {'code': 0, 'msg': 'success', 'data': data}
logging.info(f"code:0 msg:success text analyse cost Time is: {str(timeUsed)} ")
return json.dumps(res)
except Exception as e:
logging.exception(e)
res = {'code': 6, 'msg': 'request exception'}
return json.dumps(res)
if __name__ == "__main__":
logging.info('Starting the server...')
server.listen(("0.0.0.0", 8885))
server.run(app)
# app.run(host='0.0.0.0', port=8885, threaded=True)
# encoding: utf-8
from __future__ import print_function
import time
from datetime import timedelta
import os
import tensorflow as tf
import tensorflow.contrib.keras as kr
# from cnn_model import TCNNConfig, TextCNN
from rnn_model import TRNNConfig, TextRNN
from data.cnews_loader import read_category, read_vocab
import pandas as pd
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
try:
bool(type(unicode))
except NameError:
unicode = str
base_dir = 'data/cnews/'
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
save_dir = 'checkpoints/testtextrnn'
save_path = os.path.join(save_dir, 'best_validation_0.8590335594711533') # 最佳验证结果保存路径
class RnnModel:
def __init__(self):
self.config = TRNNConfig()
self.categories, self.cat_to_id = read_category()
self.words, self.word_to_id = read_vocab(vocab_dir)
self.config.vocab_size = len(self.words)
self.model = TextRNN(self.config)
self.session = tf.Session()
self.session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(sess=self.session, save_path=save_path) # 读取保存的模型
def predict(self, message):
# 支持不论在python2还是python3下训练的模型都可以在2或者3的环境下运行
content = unicode(message)
print(content)
data = [self.word_to_id[x] for x in content if x in self.word_to_id]
feed_dict = {
self.model.input_x: kr.preprocessing.sequence.pad_sequences([data], self.config.seq_length),
self.model.keep_prob: 1.0
}
y_pred_cls = self.session.run(self.model.predict, feed_dict=feed_dict)
return y_pred_cls
# return self.categories[y_pred_cls[0]]
def get_time_dif(start_time):
"""获取已使用时间"""
end_time = time.time()
time_dif = end_time - start_time
return timedelta(seconds=int(round(time_dif)))
if __name__ == '__main__':
# start_time = time.time()
rnn_model = RnnModel()
# testfilepath = 'data/cnews/textrnn_our_model_data.txt'
# df1 = pd.DataFrame(pd.read_csv(testfilepath,sep='\t', encoding='utf-8', error_bad_lines=False, header=None))
# df2 = df1.copy()
# df2.columns = ['label', 'content']
# df2['prelabel'] = df2['content'].apply(lambda x: rnn_model.predict(x))
# print("终于执行完了")
# df3 = df2[['label','prelabel', 'content']]
# df3.to_csv('data/cnews/textrnn_our_model_5000_result.txt',index=False)
# time_dif = get_time_dif(start_time)
# print("Time usage:", time_dif)
test_demo = ['真的佩服自己能把这部电影看完',
'这个电影太一般',"这个电影一般","这个电影真垃圾","这个电影真好看"]
for i in test_demo:
flag = rnn_model.predict(i)
positive_prob = flag[0][0]
negative_prob = flag[0][1]
if positive_prob >= 0.53:#正向概率大于等于0.52
print("情感为正向,概率为:"+str(positive_prob))
if positive_prob <= 0.48:#正向概率小于等于0.48
print("情感为负向,概率为:"+str(negative_prob))
elif positive_prob >= 0.48 and positive_prob <= 0.53:#正向概率大于0.48小于0.52
print("情感为中性,概率为:"+str(max(negative_prob,positive_prob)))
# print(type(flag))
# if int(flag) == 2:
# print("情感为正向")
# if int(flag) == 0:
# print("情感为负向")
下面是模型定义文件 rnn_model.py:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import tensorflow as tf
class TRNNConfig(object):
"""RNN配置参数"""
# 模型参数
embedding_dim = 128 # 词向量维度
seq_length = 600 # 序列长度
num_classes = 2 # 类别数
vocab_size = 20000 # 词汇表达小
num_layers= 2 # 隐藏层层数
hidden_dim = 128 # 隐藏层神经元
rnn = 'gru' # lstm 或 gru
dropout_keep_prob = 0.8 # dropout保留比例
learning_rate = 1e-3 # 学习率
batch_size = 128 # 每批训练大小
num_epochs = 10 # 总迭代轮次
print_per_batch = 100 # 每多少轮输出一次结果
save_per_batch = 10 # 每多少轮存入tensorboard
class TextRNN(object):
"""文本分类,RNN模型"""
def __init__(self, config):
self.config = config
# 三个待输入的数据
self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
self.rnn()
def rnn(self):
"""rnn模型"""
def lstm_cell(): # lstm核
return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)
def gru_cell(): # gru核
return tf.contrib.rnn.GRUCell(self.config.hidden_dim)
def dropout(): # 为每一个rnn核后面加一个dropout层
if (self.config.rnn == 'lstm'):
cell = lstm_cell()
else:
cell = gru_cell()
return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
# 词向量映射
with tf.device('/gpu:0'):
embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
with tf.name_scope("rnn"):
# 多层rnn网络
cells = [dropout() for _ in range(self.config.num_layers)]
rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
_outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
last = _outputs[:, -1, :] # 取最后一个时序输出作为结果
with tf.name_scope("score"):
# 全连接层,后面接dropout以及relu激活
fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
fc = tf.contrib.layers.dropout(fc, self.keep_prob)
fc = tf.nn.relu(fc)
# 分类器
self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别
with tf.name_scope("optimize"):
# 损失函数,交叉熵
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
self.loss = tf.reduce_mean(cross_entropy)
# 优化器
self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
with tf.name_scope("accuracy"):
# 准确率
correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))