1. 目标
情感分类
2. 数据来源
约12000条美团外卖平台收集的用户评价(正向4000条,负向约8000条)
https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/waimai_10k
3. 数据预处理
(1)读入数据+拆分训练集和测试集
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,roc_curve,auc,f1_score,accuracy_score,precision_recall_curve,classification_report,average_precision_score
import jieba
import jieba.posseg as pseg
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from collections import Counter
from wordcloud import WordCloud
# Plot configuration: select the 'Arial Unicode MS' font family (presumably so
# the Chinese titles render -- confirm the font exists on the target machine)
# and turn off the Unicode minus glyph on axes.
plt.rcParams.update({
    'font.family': 'Arial Unicode MS',
    'axes.unicode_minus': False,
})

# Load the review data set.
data = pd.read_csv('/Users/harper/Desktop/waimai_10k.csv')

# Split into training and test sets at 8:2, stratified on the label and with a
# fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data.review,
    data.label,
    train_size=0.8,
    stratify=data.label,
    random_state=123,
)
(2)分词
a. 导入主张、否定、停用和程度词库
# Assertion ("主张") word list
zzword=np.load('/Users/harper/Desktop/python/drop_word/regard.npy').tolist()
# Stop word list
tyword=np.load('/Users/harper/Desktop/python/drop_word/stop.npy').tolist()
# Negation word list
fdword=np.load('/Users/harper/Desktop/python/drop_word/denial.npy').tolist()
# Degree word lexicon: the file holds a single pickled object (presumably a
# dict mapping word -> degree value; confirm) of which only the keys are kept.
# NOTE(review): allow_pickle=True unpickles the file on load; only use it with
# a trusted .npy file.
cdword=np.load('/Users/harper/Desktop/python/drop_word/degree.npy',allow_pickle=True).item().keys()
# Font file and mask image used when drawing the word clouds of the four lexicons
path = '/System/Library/Fonts/Songti.ttc'
img = plt.imread('/Users/harper/Desktop/python/background.jpg')
def wordcloud(x):
    """Build a word cloud from the words in *x*.

    The words are joined with commas and passed to ``WordCloud.generate``.
    The module-level ``img`` is used as the mask and ``path`` as the font
    file; the background is white.

    Returns the generated ``WordCloud`` object, ready for ``plt.imshow``.
    """
    text = ','.join(x)
    cloud = WordCloud(mask=img, font_path=path, background_color='white')
    return cloud.generate(text)
# Draw the four lexicons as a 2x2 grid of word clouds, one panel per lexicon.
_lexicon_panels = [
    (zzword, '主张词'),
    (tyword, '停用词'),
    (fdword, '否定词'),
    (cdword, '程度词'),
]
for _position, (_lexicon, _panel_title) in enumerate(_lexicon_panels, start=1):
    plt.subplot(2, 2, _position)
    plt.imshow(wordcloud(_lexicon))
    plt.axis('off')
    plt.title(_panel_title)
# Save BEFORE showing: when run as a script, plt.show() hands the figure to the
# GUI and it is gone once the window closes, so a later savefig() would write
# an empty image.
plt.savefig('/Users/harper/Desktop/2.png',dpi=500,bbox_inches = 'tight')
plt.show()
b. 对训练集和测试集进行分词
# Part-of-speech tags to keep (nouns, verbs, adjectives and similar).
not_drop = ['n', 'v', 'vd', 'vn', 'a', 'ad', 'an', 'd']
# Words to discard: everything in the assertion, stop, negation and degree lexicons.
drop_word = [*zzword, *tyword, *fdword, *cdword]
def splitwords(df):
    """Segment every review and keep only the words used as features.

    Each review is cut with ``pseg.cut`` (part-of-speech tagging). A word is
    kept when it is not in the module-level ``drop_word`` collection and its
    tag is listed in the module-level ``not_drop``.

    Parameters
    ----------
    df : Series or DataFrame-like
        Data that exposes a ``review`` column once wrapped in ``pd.DataFrame``
        (for example a Series named ``review``).

    Returns
    -------
    pandas.DataFrame
        The wrapped input plus a ``result`` column holding the kept words of
        each review joined by single spaces. The column is also created for
        empty input.
    """
    df = pd.DataFrame(df)
    # Build hash-based lookups once: the lexicons hold many words, and testing
    # each token against a plain list would be a linear scan per token.
    dropped_words = set(drop_word)
    kept_flags = set(not_drop)

    segmented = []
    for text in df['review']:
        tokens = pseg.cut(text, use_paddle=True)
        segmented.append(' '.join(
            word for word, flag in tokens
            if word not in dropped_words and flag in kept_flags
        ))

    # Assign the whole column at once instead of enlarging the frame cell by
    # cell with .loc; this is also safe with duplicate index labels.
    df['result'] = pd.Series(segmented, index=df.index, dtype=object)
    return df
# Segment both splits; each name is rebound to the DataFrame returned by
# splitwords(), which carries the original `review` text and the space-joined
# `result` column.
x_train = splitwords(x_train)
x_test = splitwords(x_test)
c. 词向量化:计算TF-IDF
# Build the TF-IDF vectorizer.
tdidf = TfidfVectorizer()
# Fit it on the training text only and transform that text in one step.
X_train = tdidf.fit_transform(x_train['result'])
# The matrix exposes its shape directly; converting it to a dense array just to
# read the shape would allocate the full (documents x vocabulary) array.
print(X_train.shape)
此时训练集特征维度为9589*5315,我们建立Lasso-逻辑回归模型进行特征选择。
d. 特征选择
经过下面的Lasso特征选择后,训练集特征维度由9589*5315降至9589*186。
# Lasso feature selection: an L1-penalised logistic regression wrapped in
# SelectFromModel acts as the feature selector.
select_lasso = SelectFromModel(LogisticRegression(penalty = 'l1',C = 0.5,solver = 'liblinear'))
# Fit the selector on the training data and convert the reduced matrix to a dense array.
xx_train = select_lasso.fit_transform(X_train,y_train).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tdidf, 'get_feature_names_out'):
    words = tdidf.get_feature_names_out().tolist()
else:
    words = tdidf.get_feature_names()
# Keep only the vocabulary entries whose column was selected.
feature = pd.DataFrame(words).loc[select_lasso.get_support().tolist()]
# Word cloud of the final training-set vocabulary: the selected features,
# weighted by how often each occurs in the segmented training reviews.
l = ' '.join(x_train['result']).split(' ')  # every kept token of the training set
word_counts = Counter(l)  # token -> occurrence count
word_counts_df = pd.DataFrame(data = {"Word":list(word_counts.keys()),"Freq":list(word_counts.values())})
# Keep only the words chosen by the Lasso selector. A set plus Series.isin gives
# the same mask as testing `x in feature.values` per word, without scanning the
# whole NumPy array for every word.
_selected_words = set(feature.values.ravel())
freq = word_counts_df.loc[word_counts_df['Word'].isin(_selected_words)]
count = freq.sort_values(by=['Freq'],ascending=False)
name = count.iloc[:,0]
value = count.iloc[:,1]
dic = dict(zip(name,value))
plt.imshow(WordCloud(mask = img,font_path = path,background_color='white').generate_from_frequencies(dic))
plt.axis('off')
# Save before show so the written image is not empty.
plt.savefig('/Users/harper/Desktop/4.png',dpi=500,bbox_inches = 'tight')
plt.show()
e. 对测试集计算TF-IDF,并进行特征选择
# Apply the vectorizer and the selector to the test text with transform() only
# (no refit here).
# NOTE: x_test is rebound here from the segmented DataFrame to its TF-IDF matrix.
x_test = tdidf.transform(x_test['result'])
xx_test = select_lasso.transform(x_test).toarray()
4. 分类模型
(1)贝叶斯分类器
a. 高斯似然
# Naive Bayes model with a Gaussian likelihood.
from sklearn.naive_bayes import GaussianNB

SEED = 123456  # random seed for the models built later
model_NB = GaussianNB()

# Time the fit call only.
t = time()
model_NB.fit(xx_train, y_train)
print('时间:', round(time() - t, 4))

# Classification report on the training set, then on the test set.
train_report = classification_report(y_train, model_NB.predict(xx_train), digits=4)
print('GaussianNB模型结果评价-训练集:\n', train_report)
test_report = classification_report(y_test, model_NB.predict(xx_test), digits=4)
print('GaussianNB模型结果评价-测试集:\n', test_report)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
时间: 0.1187
GaussianNB模型结果评价-训练集:
precision recall f1-score support
0 0.9379 0.4987 0.6511 6389
1 0.4827 0.9341 0.6365 3200
accuracy 0.6440 9589