利用 keras 进行点评评论情感分析

利用 keras 进行点评评论情感分析

一、准备工作

1、数据导入

import pandas as pd
data = pd.read_csv('/Users/liming/Downloads/review.csv')
print(data.shape)
data.head()
(100000, 3)
reviewidreviewbodystar
0661913194宝燕乐园的滑滑梯很出名啊,波浪行的,陡峭型的,管道式的,小朋友一个滑滑梯滑过来,乐此不疲。园...50
1661913413挺不错的潮汕味道,老家的味道50
2661913671心心念念几个月,今天终于成行啦,四个人早早开车前往田林路,最近在修路所以车只能停在地下室,(...50
3661914092干净卫生,老板服务热情,很温馨的汗蒸房??50
4661916265慕名而去的,88元12个的大生蚝吃完感觉特别值,出菜员还告诉我们给我们多加了两个。感觉在广东...45

2、情感划分

我们的目的是分析文本的情感:积极或消极。
因此,这里设置阈值为30:star 小于30的为消极(0)、大于等于30的为积极(1)。

# 定义函数:根据用户评的星级来估计sentiment(情感)
def make_label(star):
    if star >=30:
        return 1
    else:
        return 0
# 运用 apply 方法得到新列
data["sentiment"] = data.star.apply(make_label)
data.head()
reviewidreviewbodystarsentiment
0661913194宝燕乐园的滑滑梯很出名啊,波浪行的,陡峭型的,管道式的,小朋友一个滑滑梯滑过来,乐此不疲。园...501
1661913413挺不错的潮汕味道,老家的味道501
2661913671心心念念几个月,今天终于成行啦,四个人早早开车前往田林路,最近在修路所以车只能停在地下室,(...501
3661914092干净卫生,老板服务热情,很温馨的汗蒸房??501
4661916265慕名而去的,88元12个的大生蚝吃完感觉特别值,出菜员还告诉我们给我们多加了两个。感觉在广东...451
sentences = data["reviewbody"].astype(str)
labels = data["sentiment"]

3、分词并转化为词向量

import jieba
from gensim.models.word2vec import Word2Vec
def train_word2vec(sentences,save_path):
    sentences_seg = []
    sen_str = "\n".join(sentences)
    res = jieba.lcut(sen_str)
    seg_str = " ".join(res)
    sen_list = seg_str.split("\n")
    for i in sen_list:
        sentences_seg.append(i.split())
    print("开始训练词向量") 
#     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(sentences_seg,
                size=100,  # 词向量维度
                min_count=5,  # 词频阈值
                window=5)  # 窗口大小    
    model.save(save_path)
    return model

model =  train_word2vec(sentences,'/Users/liming/Downloads/word2vec.model')    
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/zd/qhg48cw17_ncqf0rl48wz5rh0000gp/T/jieba.cache
Loading model cost 0.662 seconds.
Prefix dict has been built successfully.


开始训练词向量

4、数据预处理

from gensim.corpora.dictionary import Dictionary
from gensim import models
import numpy as np

def generate_id2wec(word2vec_model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2id = {v: k + 1 for k, v in gensim_dict.items()}  # 词语的索引,从1开始编号
    w2vec = {word: model[word] for word in w2id.keys()}  # 词语的词向量
    n_vocabs = len(w2id) + 1
    embedding_weights = np.zeros((n_vocabs, 100))
    for w, index in w2id.items():  # 从索引为1的词语开始,用词向量填充矩阵
        embedding_weights[index, :] = w2vec[w]
    return w2id,embedding_weights

def text_to_array(w2index, senlist):  # 文本转为索引数字模式
    sentences_array = []
    for sen in senlist:
        new_sen = [ w2index.get(word,0) for word in sen]   # 单词转索引数字
        sentences_array.append(new_sen)
    return np.array(sentences_array)

def prepare_data(w2id,sentences,labels,max_len=200):
    X_train, X_val, y_train, y_val = train_test_split(sentences,labels, test_size=0.2)
    X_train = text_to_array(w2id, X_train)
    X_val = text_to_array(w2id, X_val)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_val = pad_sequences(X_val, maxlen=max_len)
    return np.array(X_train), np_utils.to_categorical(y_train) ,np.array(X_val), np_utils.to_categorical(y_val)
w2id, embedding_weights = generate_id2wec(model)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  if __name__ == '__main__':
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

x_train, y_trian, x_val , y_val = prepare_data(w2id, sentences, labels,200)

二、构建模型

class Sentiment:
    def __init__(self,w2id,embedding_weights,Embedding_dim,maxlen,labels_category):
        self.Embedding_dim = Embedding_dim
        self.embedding_weights = embedding_weights
        self.vocab = w2id
        self.labels_category = labels_category
        self.maxlen = maxlen
        self.model = self.build_model()
      
        
    def build_model(self):
        model = Sequential()
        #input dim(140,100)
        model.add(Embedding(output_dim = self.Embedding_dim,
                           input_dim=len(self.vocab)+1,
                           weights=[self.embedding_weights],
                           input_length=self.maxlen))
        model.add(Bidirectional(LSTM(50),merge_mode='concat'))
        model.add(Dropout(0.5))
        model.add(Dense(self.labels_category))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy',
                     optimizer='adam', 
                     metrics=['accuracy'])
        model.summary()
        return model
    
    def train(self,X_train, y_train,X_test, y_test,n_epoch=5 ):
        self.model.fit(X_train, y_train, batch_size=32, epochs=n_epoch,
                      validation_data=(X_test, y_test))
        self.model.save('sentiment.h5')   
        
    def predict(self,model_path,new_sen):
        model = self.model
        model.load_weights(model_path)
        new_sen_list = jieba.lcut(new_sen)
        sen2id =[ self.vocab.get(word,0) for word in new_sen_list]
        sen_input = pad_sequences([sen2id], maxlen=self.maxlen)
        res = model.predict(sen_input)[0]
        return np.argmax(res)
from keras import Sequential
from keras.layers import Bidirectional,LSTM,Dense,Embedding,Dropout,Activation,Softmax

senti = Sentiment(w2id,embedding_weights,100,200,2)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 200, 100)          2885200   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100)               60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
=================================================================
Total params: 2,945,802
Trainable params: 2,945,802
Non-trainable params: 0
_________________________________________________________________
senti.train(x_train,y_trian, x_val ,y_val,1)
/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 80000 samples, validate on 20000 samples
Epoch 1/1
80000/80000 [==============================] - 1776s 22ms/step - loss: 0.1473 - accuracy: 0.9518 - val_loss: 0.1270 - val_accuracy: 0.9569
label_dic = {0:"消极的",1:"积极的"}
sen_new = "这家的银耳莲子羹很不错,上菜很快,菜的照片很真实"
pre = senti.predict("./sentiment.h5",sen_new)
print("'{}'的情感是:\n{}".format(sen_new,label_dic.get(pre)))
'这家的银耳莲子羹很不错,上菜很快,菜的照片很真实'的情感是:
积极的
  • 1
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值