Sanders Analytics Twitter Sentiment Corpus 是一个推特情感分析数据集。用 BERT 模型训练的效果还不错，准确率大约达到了 92.6%，这里把代码分享出来。数据集可从这个网址下载：http://www.keenformatics.com/articles/2015-07/sentiment-analysis-lexicons-and-datasets#sanders
环境
Ubuntu 16.04, Keras 2.2, tensorflow-gpu, Kashgari
代码
import os
import re
import string

import numpy as np
import pandas as pd
from kashgari.corpus import SMP2017ECDTClassificationCorpus
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import CNNLSTMModel
from nltk.tokenize import RegexpTokenizer
# Directory that holds the Sanders Twitter sentiment CSV files.
root_path = './dataset/Sander_TwitterSentiment_Full'

# Read the training CSV and give its two columns fixed names so the rest of
# the script can refer to them as 'label' and 'text'.
train_csv_path = os.path.join(root_path, "full_corpus_binary.csv")
full_dataset = pd.read_csv(train_csv_path)
full_dataset.columns = ['label', 'text']

# Show the first rows as a quick sanity check of the raw data.
print(full_dataset.head())
# Compiled once at import time instead of once per tweet. A token is an ASCII
# letter followed by 1-50 ASCII letters/digits, delimited by word boundaries.
_TOKEN_PATTERN = re.compile(r'\b[a-zA-Z][a-zA-Z0-9]{1,50}\b')


def tokenize(text):
    """Split a piece of text into alphanumeric word tokens.

    A token must start with an ASCII letter and be 2 to 51 characters long,
    so single-character words and tokens that start with a digit are
    dropped. Case is preserved.

    Args:
        text: The raw text to tokenize. Non-string values (for example the
            float NaN that pandas produces for an empty CSV cell) are
            treated as empty text.

    Returns:
        A list of token strings in order of appearance; empty when nothing
        matches or when ``text`` is not a string.
    """
    # Guard against missing values so that Series.apply(tokenize) does not
    # raise TypeError on a single empty cell.
    if not isinstance(text, str):
        return []
    # findall on the pattern is what nltk's RegexpTokenizer does in its
    # default (non-gap) mode. The former strip(string.punctuation) pass is
    # omitted because matched tokens can only contain letters and digits.
    return _TOKEN_PATTERN.findall(text)
# Replace each training tweet with its list of tokens.
full_dataset['text'] = full_dataset['text'].apply(tokenize)
print(full_dataset.head())

# The model is fed plain Python lists: one token list per sample for x and
# one label per sample for y.
train_x = full_dataset['text'].tolist()
train_y = full_dataset['label'].tolist()

# Load the held-out CSV and prepare it in exactly the same way.
validation = pd.read_csv(os.path.join(root_path, "test-corpus_binary.csv"))
validation.columns = ['label', 'text']
validation['text'] = validation['text'].apply(tokenize)
test_x = validation['text'].tolist()
test_y = validation['label'].tolist()

# BERT embedding layer feeding a CNN-LSTM classification head.
bert_embedding = BERTEmbedding('bert-base-uncased', sequence_length=30)
model = CNNLSTMModel(bert_embedding)

# The held-out set is used both as validation data during training and for
# the final evaluation below.
model.fit(
    train_x,
    train_y,
    epochs=10,
    validation_data=(test_x, test_y),
)

scores = model.evaluate(test_x, test_y)
# NOTE(review): assumes element 1 of the evaluate() result is the accuracy
# figure - confirm against the installed Kashgari version.
accuracy_percent = scores[1] * 100
print("Accuracy:%.2f%%" % accuracy_percent)
参考文献
[1]. Kashgari. https://github.com/BrikerMan/Kashgari