import jieba
from gensim.models.word2vec import Word2Vec
def train_word2vec(sentences, save_path):
    """Segment raw Chinese sentences with jieba and train a Word2Vec model.

    Args:
        sentences: iterable of raw (unsegmented) sentence strings.
        save_path: file path where the trained model is saved.

    Returns:
        The trained gensim Word2Vec model.
    """
    # Join all sentences with newlines so jieba can segment them in a single
    # pass, then split back into one token list per original sentence.
    sen_str = "\n".join(sentences)
    res = jieba.lcut(sen_str)
    seg_str = " ".join(res)
    sen_list = seg_str.split("\n")
    sentences_seg = [line.split() for line in sen_list]
    print("开始训练词向量")
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(sentences_seg,
                     size=100,     # embedding dimension (gensim<4 keyword)
                     min_count=5,  # drop words appearing fewer than 5 times
                     window=5)     # context window size
    model.save(save_path)
    return model
model = train_word2vec(sentences,'/Users/liming/Downloads/word2vec.model')
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/zd/qhg48cw17_ncqf0rl48wz5rh0000gp/T/jieba.cache
Loading model cost 0.662 seconds.
Prefix dict has been built successfully.
开始训练词向量
4、数据预处理
from gensim.corpora.dictionary import Dictionary
from gensim import models
import numpy as np
def generate_id2wec(word2vec_model):
    """Build word->index and index->vector tables from a trained model.

    Args:
        word2vec_model: a trained gensim (<4.0) Word2Vec model.

    Returns:
        (w2id, embedding_weights): w2id maps word -> 1-based index;
        embedding_weights has shape (vocab_size + 1, vector_size) and its
        row i holds the vector of the word with index i. Row 0 stays all
        zeros so index 0 can be used for padding/unknown words.
    """
    gensim_dict = Dictionary()
    # BUG FIX: the original body read the *global* `model` instead of the
    # `word2vec_model` parameter; use the parameter throughout.
    gensim_dict.doc2bow(word2vec_model.wv.vocab.keys(), allow_update=True)
    # Word indices start at 1 so that 0 is free for padding.
    w2id = {v: k + 1 for k, v in gensim_dict.items()}
    # `.wv[word]` replaces the deprecated `model[word]` access (see the
    # DeprecationWarning emitted by the original run).
    w2vec = {word: word2vec_model.wv[word] for word in w2id}
    n_vocabs = len(w2id) + 1
    # Generalized: use the model's actual vector size instead of a
    # hard-coded 100 (backward-compatible — the model above uses size=100).
    embedding_weights = np.zeros((n_vocabs, word2vec_model.vector_size))
    for w, index in w2id.items():  # fill rows starting from index 1
        embedding_weights[index, :] = w2vec[w]
    return w2id, embedding_weights
def text_to_array(w2index, senlist):
    """Convert tokenized sentences to lists of word indices.

    Args:
        w2index: dict mapping word -> integer index (1-based).
        senlist: iterable of token lists (already-segmented sentences).

    Returns:
        np.ndarray of index sequences; unknown words map to 0.
    """
    sentences_array = [[w2index.get(word, 0) for word in sen]
                       for sen in senlist]
    return np.array(sentences_array)


def prepare_data(w2id, sentences, labels, max_len=200):
    """Split into train/val sets, index-encode, and pad to *max_len*.

    Args:
        w2id: word -> index mapping (0 reserved for padding/unknown).
        sentences: tokenized sentences.
        labels: integer class labels aligned with *sentences*.
        max_len: sequence length after padding/truncation.

    Returns:
        (X_train, y_train_onehot, X_val, y_val_onehot).

    NOTE(review): relies on `train_test_split`, `pad_sequences` and
    `np_utils` being importable at call time — they are imported inside the
    __main__ guard later in this file; confirm they are in scope.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        sentences, labels, test_size=0.2)
    X_train = text_to_array(w2id, X_train)
    X_val = text_to_array(w2id, X_val)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_val = pad_sequences(X_val, maxlen=max_len)
    return (np.array(X_train), np_utils.to_categorical(y_train),
            np.array(X_val), np_utils.to_categorical(y_val))
w2id, embedding_weights = generate_id2wec(model)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
if __name__ == '__main__':
    # Heavy third-party imports kept inside the guard so importing this
    # module does not pull in sklearn/keras. These names become module
    # globals here, which prepare_data() relies on at call time.
    from sklearn.model_selection import train_test_split
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import np_utils
    # NOTE(review): `sentences` and `labels` must be defined earlier (not
    # visible in this chunk). `y_trian` (sic) kept as-is in case later code
    # outside this view references the misspelled name.
    x_train, y_trian, x_val, y_val = prepare_data(w2id, sentences, labels, 200)
from keras import Sequential
from keras.layers import Bidirectional,LSTM,Dense,Embedding,Dropout,Activation,Softmax
# NOTE(review): `Sentiment` is not defined anywhere in this chunk —
# presumably a BiLSTM classifier wrapper taking (w2id, embedding_weights,
# embedding_dim=100, max_len=200, n_classes=2); confirm its definition.
senti = Sentiment(w2id,embedding_weights,100,200,2)
/opt/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Train on 80000 samples, validate on 20000 samples
Epoch 1/1
80000/80000 [==============================] - 1776s 22ms/step - loss: 0.1473 - accuracy: 0.9518 - val_loss: 0.1270 - val_accuracy: 0.9569
# Map class indices back to human-readable sentiment labels
# (0 = negative, 1 = positive).
label_dic = {0: "消极的", 1: "积极的"}
sen_new = "这家的银耳莲子羹很不错,上菜很快,菜的照片很真实"
# Load the saved weights and classify the new sentence.
pre = senti.predict("./sentiment.h5", sen_new)
print("'{}'的情感是:\n{}".format(sen_new, label_dic.get(pre)))
利用 keras 进行点评评论情感分析

一、准备工作

1、数据导入

    import pandas as pd
    data = pd.read_csv('/Users/liming/Downloads/review.csv')
    print(data.shape)
    data.head()

输出: (100000, 3) — 列为 reviewid / reviewbody / star