lstm训练情感分析的优点_LSTM 情感分析 (Keras 版本)

所需环境:Python3.6 + Tensorflow

还需要安装 Keras:

pip install keras -i https://pypi.tuna.tsinghua.edu.cn/simple/

所需数据集:neg.xls(负面情绪语料)和 pos.xls(正面情绪语料),需放在运行脚本时的当前目录下。

基于LSTM网络,结构图如下:

image.png

代码解释

导入相关模块

import pandas as pd

import numpy as np

import jieba

from keras.preprocessing import sequence

from keras.models import Sequential

from keras.layers.core import Dense, Dropout, Activation

from keras.layers.embeddings import Embedding

from keras.layers.recurrent import LSTM

from sklearn.utils import shuffle

from keras.utils import plot_model

读取数据集,分别存储在 DataFrame 里

# Load the two corpora. header=None: the sheets have no header row, so the
# text lands in column 0. The former `index=None` keyword was removed because
# it is not a documented read_excel argument and current pandas versions
# reject unknown keywords.
neg = pd.read_excel('neg.xls', header=None)  # negative-sentiment corpus

pos = pd.read_excel('pos.xls', header=None)  # positive-sentiment corpus

数据集打标签

# Tag every row with its class label in a new 'mark' column.
pos['mark'] = 1 # positive sentiment

neg['mark'] = 0 # negative sentiment

合并数据集

# Stack the positive rows on top of the negative rows; ignore_index=True
# renumbers the combined rows from 0.
df = pd.concat([pos,neg],ignore_index=True)

对数据集进行分词

# Segment the text in column 0 with jieba; each cell of 'words' becomes a list of tokens.
df['words'] = df[0].apply(lambda x: list(jieba.cut(x))) # word segmentation

如图:

image.png

统计分词后每个词出现的次数(主要是去重)

# Flatten all per-sentence token lists into one list, then count occurrences.
# The resulting frame has one row per distinct token, with the token as index.
all_tokens = [token for tokens in df['words'] for token in tokens]

df_words = pd.DataFrame(pd.Series(all_tokens).value_counts())

对每个词进行编号:

# Give each distinct token an integer id, starting at 1, in df_words row order.
df_words['id'] = list(range(1,len(df_words)+1)) # ids increase sequentially

image.png

把每个中文句子转成句子向量(使用简单的编号向量)

# Replace each token with its id by label lookup on df_words (indexed by token).
# The column name 'words_vecoter' (sic) is kept because later lines read it.
df['words_vecoter'] = df['words'].apply(lambda x: list(df_words['id'][x]))

image.png

把句子向量的长度统一到50:长度不够的补0,超过50的会被截断(pad_sequences 默认在序列前端补0和截断)

# Force every id sequence to length 50: shorter ones are zero-padded, longer
# ones are truncated. NOTE(review): with Keras defaults both happen at the
# front of the sequence — confirm against the installed version.
df['words_vecoter'] = list(sequence.pad_sequences(df['words_vecoter'], maxlen=50))

image.png

数据整理完成,获取训练集和测试集

# Materialise the padded sequences and labels once, then split by row parity:
# even-numbered rows form the training set, odd-numbered rows the test set.
features = np.array(list(df['words_vecoter']))

labels = np.array(list(df['mark']))

x_train, y_train = features[::2], labels[::2]  # training set

x_test, y_test = features[1::2], labels[1::2]  # test set

随机打乱数据集

# Shuffle features and labels together so each sample keeps its own label.
x_train,y_train = shuffle(x_train,y_train)

x_test,y_test = shuffle(x_test,y_test)

构造循环网络

# Embedding input size: token ids run from 1 to len(df_words) and 0 is the
# padding value, hence the +1.
dlen = len(df_words) + 1

model = Sequential()

model.add(Embedding(dlen, 256)) # The Embedding layer acts like a fully connected layer fed with one-hot input whose width is the word-vector dimension; its weights are the word-vector lookup table.

model.add(LSTM(128))

model.add(Dropout(0.5))

# A single sigmoid unit gives one value in (0, 1) per sample, matching the 0/1 'mark' labels.
model.add(Dense(1))

model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

plot_model(model, to_file='sentiment_nalysis.png', show_shapes=True) # save a diagram of the network structure

model.summary() # print the network structure

训练模型

# Train for 5 passes over the training set in mini-batches of 16.
# `epochs` is the Keras 2 name of the former `nb_epoch` argument, which newer
# releases no longer accept.
model.fit(x_train, y_train, batch_size=16, epochs=5)

评估预测

# One sigmoid output per test sample.
y_predict = model.predict(x_test)

print(y_predict)

# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]; unpack both instead of printing the whole list as accuracy.
loss, acc = model.evaluate(x_test, y_test)

print('Test loss:', loss)

print('Test accuracy:', acc)

保存模型

# Save the trained model to an .h5 file in the current directory.
# The file name spelling ('nalysis') matches the plot file name used earlier.
model.save('sentiment_nalysis.h5')

完整代码

import pandas as pd

import numpy as np

import jieba

from keras.preprocessing import sequence

from keras.models import Sequential

from keras.layers.core import Dense, Dropout, Activation

from keras.layers.embeddings import Embedding

from keras.layers.recurrent import LSTM

from sklearn.utils import shuffle

from keras.utils import plot_model

# --- Load and label the data -------------------------------------------------
# header=None: the sheets have no header row, so the text lands in column 0.
# The former `index=None` keyword was removed because it is not a documented
# read_excel argument and current pandas versions reject unknown keywords.
neg = pd.read_excel('neg.xls', header=None)  # negative-sentiment corpus
pos = pd.read_excel('pos.xls', header=None)  # positive-sentiment corpus

pos['mark'] = 1  # positive sentiment
neg['mark'] = 0  # negative sentiment

df = pd.concat([pos, neg], ignore_index=True)

# --- Tokenise and build the vocabulary ---------------------------------------
df['words'] = df[0].apply(lambda x: list(jieba.cut(x)))  # word segmentation

# One row per distinct token (token as index) with its occurrence count.
df_words = pd.DataFrame(pd.Series([j for i in df['words'] for j in i]).value_counts())
df_words['id'] = list(range(1, len(df_words) + 1))  # ids start at 1; 0 is the padding value

# --- Encode sentences as fixed-length id sequences ---------------------------
# The column name 'words_vecoter' (sic) is kept unchanged.
df['words_vecoter'] = df['words'].apply(lambda x: list(df_words['id'][x]))
# Zero-pad or truncate every sequence to exactly 50 ids.
df['words_vecoter'] = list(sequence.pad_sequences(df['words_vecoter'], maxlen=50))

# --- Train / test split ------------------------------------------------------
# Build the arrays once, then split by row parity: even rows train, odd rows test.
features = np.array(list(df['words_vecoter']))
labels = np.array(list(df['mark']))

x_train, y_train = features[::2], labels[::2]
x_test, y_test = features[1::2], labels[1::2]

# Shuffle features and labels together so each sample keeps its own label.
x_train, y_train = shuffle(x_train, y_train)
x_test, y_test = shuffle(x_test, y_test)

# --- Build the network -------------------------------------------------------
dlen = len(df_words) + 1  # ids 1..len(df_words) plus the padding id 0

model = Sequential()
model.add(Embedding(dlen, 256))  # learned 256-dimensional vector per token id
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))  # single output in (0, 1) for the 0/1 label

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

plot_model(model, to_file='sentiment_nalysis.png', show_shapes=True)  # save a diagram of the network
model.summary()  # print the network structure

# --- Train -------------------------------------------------------------------
# `epochs` is the Keras 2 name of the former `nb_epoch` argument, which newer
# releases no longer accept.
model.fit(x_train, y_train, batch_size=16, epochs=5)

# --- Evaluate ----------------------------------------------------------------
y_predict = model.predict(x_test)
print(y_predict)

# Compiled with metrics=['accuracy'], so evaluate() returns [loss, accuracy];
# unpack both instead of printing the whole list as accuracy.
loss, acc = model.evaluate(x_test, y_test)
print('Test loss:', loss)
print('Test accuracy:', acc)

# --- Persist -----------------------------------------------------------------
model.save('sentiment_nalysis.h5')

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值