今天帮助了一位外国人做了一下keras版本的中文情感分析,我这里没有去掉标点符号,停用词这些,有兴趣的读者可以自己实现。我这里把代码分享出来。数据集下载地址为:http://file.hankcs.com/corpus/ChnSentiCorp.zip
环境
keras 2.2.4 ubuntu 16.04 python 3.6 tensorflow-gpu 2.2.4
pip install kashgari
代码
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import CNNLSTMModel
from kashgari.corpus import SMP2017ECDTClassificationCorpus
import numpy as np
from nltk.tokenize import RegexpTokenizer
import string
import os
import pandas as pd
import jieba
# Root folder of the unpacked ChnSentiCorp corpus (train.tsv / test.tsv).
root_path = './dataset/chnsenticorp'
# The corpus ships as tab-separated files with columns `label` and `text_a`.
train_data = pd.read_csv(os.path.join(root_path, "train.tsv"), sep='\t')
print(train_data.head())
def tokenlize_(text):
    """Segment a Chinese sentence into a list of tokens with jieba.

    Uses jieba's full mode (``cut_all=True``), which emits every possible
    word span rather than a single best segmentation.

    :param text: raw Chinese sentence to segment
    :return: list of token strings
    """
    # NOTE: the parameter must not be called `string` -- that would shadow
    # the `string` module imported at the top of this file.
    seg_list = jieba.cut(text, cut_all=True)
    return list(seg_list)
# Segment every training sentence; kashgari expects each sample as a
# list of tokens, so `text_a` becomes a column of token lists.
# (`apply(tokenlize_)` -- no lambda wrapper needed.)
train_data['text_a'] = train_data['text_a'].apply(tokenlize_)
print(train_data.head())
# x: list of token lists; Series.tolist() replaces the redundant
# np.array(...).tolist() round-trip of the original.
train_x = train_data['text_a'].tolist()
print(train_x[0][0])
# y: list of 0/1 sentiment labels (already integers in the corpus).
train_y = train_data['label'].tolist()
print(train_y[0])
# Load and tokenise the held-out split; it is used both as validation
# data during training and for the final evaluation below.
validation = pd.read_csv(os.path.join(root_path, "test.tsv"), sep='\t')
validation['text_a'] = validation['text_a'].apply(tokenlize_)
# Series.tolist() directly -- the np.array(...).tolist() round-trip of
# the original added nothing.
test_x = validation['text_a'].tolist()
test_y = validation['label'].tolist()
print(test_y[0])
# Sequence length 30: longer reviews are truncated, shorter ones padded.
# TODO(review): confirm 30 tokens is enough for this corpus.
bert_embedding = BERTEmbedding('bert-base-chinese', sequence_length=30)
# CNN + LSTM classification head on top of the BERT embedding.
model = CNNLSTMModel(bert_embedding)
model.fit(train_x, train_y, epochs=10, validation_data=(test_x, test_y))
# kashgari's evaluate returns a classification report rather than a
# Keras-style [loss, acc] list, so report it as-is instead of the
# original (commented-out, broken) `scores[1]*100` print.
scores = model.evaluate(test_x, test_y)
print(scores)
参考文献
[1].Kashgari. https://github.com/BrikerMan/Kashgari