文本分词处理+机器学习模型

1. 目标

情感分类

2. 数据来源

约12000条美团外卖平台收集的用户评价(正向 4000 条,负向约 8000 条)

https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/waimai_10k

3. 数据预处理

(1)读入数据+拆分训练集和测试集

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,roc_curve,auc,f1_score,accuracy_score,precision_recall_curve,classification_report,average_precision_score
import jieba
import jieba.posseg as pseg
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from collections import Counter
from wordcloud import WordCloud
# Plot configuration: font used for the Chinese plot titles, and render the
# minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.family': 'Arial Unicode MS',
    'axes.unicode_minus': False,
})

# Load the review corpus.
data = pd.read_csv('/Users/harper/Desktop/waimai_10k.csv')

# Stratified 80/20 train/test split on the label, with a fixed random seed.
x_train, x_test, y_train, y_test = train_test_split(
    data.review,
    data.label,
    train_size=0.8,
    stratify=data.label,
    random_state=123,
)

(2)分词

a. 导入主张、否定、停用和程度词库

def _load_word_list(npy_path):
    """Load the NumPy array saved at *npy_path* and return it as a plain list."""
    return np.load(npy_path).tolist()


# "Regard" (assertion) lexicon.
zzword = _load_word_list('/Users/harper/Desktop/python/drop_word/regard.npy')
# Stop-word lexicon.
tyword = _load_word_list('/Users/harper/Desktop/python/drop_word/stop.npy')
# Negation lexicon.
fdword = _load_word_list('/Users/harper/Desktop/python/drop_word/denial.npy')
# Degree lexicon: the file holds a single pickled object (presumably a dict
# mapping word -> degree weight -- confirm); only its keys are used here.
_degree_table = np.load('/Users/harper/Desktop/python/drop_word/degree.npy', allow_pickle=True).item()
cdword = _degree_table.keys()

# Resources shared by every word cloud drawn for the four lexicons:
# the font file and the image used as the cloud mask.
path = '/System/Library/Fonts/Songti.ttc'
img = plt.imread('/Users/harper/Desktop/python/background.jpg')


def wordcloud(x):
    """Build a word cloud from the iterable of words *x*.

    The words are joined with commas and fed to ``WordCloud.generate``; the
    cloud uses ``img`` as its mask, ``path`` as its font and a white
    background. Returns the generated ``WordCloud`` object.
    """
    cloud = WordCloud(mask=img, font_path=path, background_color='white')
    text = ','.join(x)
    return cloud.generate(text)

# Draw the four lexicons as a 2x2 grid of word clouds, one titled panel each.
_lexicon_panels = [
    (zzword, '主张词'),
    (tyword, '停用词'),
    (fdword, '否定词'),
    (cdword, '程度词'),
]
for _position, (_lexicon, _title) in enumerate(_lexicon_panels, start=1):
    plt.subplot(2, 2, _position)
    plt.imshow(wordcloud(_lexicon))
    plt.axis('off')
    plt.title(_title)

# Save BEFORE show(): with a blocking backend the figure is closed when the
# window is dismissed, so calling savefig() afterwards writes an empty image.
plt.savefig('/Users/harper/Desktop/2.png', dpi=500, bbox_inches='tight')
plt.show()

b.  对训练集和测试集进行分词

# Part-of-speech tags to keep (nouns, verbs, adjectives, adverbs and variants).
not_drop = ['n','v','vd','vn','a','ad','an','d']
# Words to discard: everything in the regard, stop, negation and degree lexicons.
drop_word = zzword+tyword+fdword+list(cdword)
# Set copies of the two lists so every per-token membership test is O(1)
# instead of a linear scan over the whole lexicon.
_drop_word_set = set(drop_word)
_keep_flag_set = set(not_drop)


def splitwords(df):
    """Segment every review and keep only the informative tokens.

    Parameters
    ----------
    df : object accepted by ``pd.DataFrame`` that yields a ``review`` column
        (the callers below pass the ``review`` Series of each split).

    Returns
    -------
    pandas.DataFrame
        The input as a DataFrame with an added ``result`` column holding,
        for each row, the kept tokens joined by single spaces. A token is
        kept when its POS tag is in ``not_drop`` and the word itself is not
        in ``drop_word``.
    """
    df = pd.DataFrame(df)
    results = []
    for text in df['review']:
        kept = [
            word
            for word, flag in pseg.cut(text, use_paddle=True)
            if word not in _drop_word_set and flag in _keep_flag_set
        ]
        results.append(' '.join(kept))
    # Assign the whole column once rather than growing it cell by cell.
    df['result'] = results
    return df


x_train = splitwords(x_train)
x_test = splitwords(x_test)

c. 词向量化:计算TF-IDF

# Build the TF-IDF model and fit it on the segmented training reviews.
tdidf = TfidfVectorizer()
X_train = tdidf.fit_transform(x_train['result'])
# Read the shape directly from the sparse matrix: converting to a dense array
# just to inspect its shape would allocate the full samples x vocabulary matrix.
print(X_train.shape)

此时训练集特征维度为9589*5315,我们建立Lasso-逻辑回归模型进行特征选择。

d. 特征选择

经过下面的Lasso特征选择后,训练集特征维度由9589*5315降至9589*186。

# Lasso feature selection: an L1-penalised logistic regression wrapped in
# SelectFromModel picks the TF-IDF columns to keep.
select_lasso = SelectFromModel(LogisticRegression(penalty='l1', C=0.5, solver='liblinear'))
# Fit the selector and convert the reduced sparse matrix to a dense array.
xx_train = select_lasso.fit_transform(X_train, y_train).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tdidf, 'get_feature_names_out'):
    words = list(tdidf.get_feature_names_out())
else:
    words = tdidf.get_feature_names()
# Vocabulary terms whose column was retained by the selector.
feature = pd.DataFrame(words).loc[select_lasso.get_support().tolist()]

# Word cloud of the final training-set tokens, restricted to selected features.
l = ' '.join(x_train['result']).split(' ')  # all training tokens
word_counts = Counter(l)  # token -> occurrence count
word_counts_df = pd.DataFrame(data={"Word": list(word_counts.keys()), "Freq": list(word_counts.values())})
# Keep only the tokens chosen by the Lasso selector; a set makes the filter
# one hashed lookup per word instead of a scan over the feature array.
_selected_words = set(feature[0])
freq = word_counts_df.loc[word_counts_df['Word'].isin(_selected_words)]
count = freq.sort_values(by=['Freq'], ascending=False)
name = count.iloc[:, 0]
value = count.iloc[:, 1]
dic = dict(zip(name, value))
plt.imshow(WordCloud(mask=img, font_path=path, background_color='white').generate_from_frequencies(dic))
plt.axis('off')
# Save before show() so the rendered figure, not an empty one, is written.
plt.savefig('/Users/harper/Desktop/4.png', dpi=500, bbox_inches='tight')
plt.show()

e. 对测试集计算TF-IDF,并进行特征选择

# Vectorise the segmented test reviews with the TF-IDF model fitted on the
# training set, then keep only the columns chosen by the Lasso selector.
_test_documents = x_test['result']
x_test = tdidf.transform(_test_documents)
_selected_test = select_lasso.transform(x_test)
xx_test = _selected_test.toarray()

4. 分类模型

(1)贝叶斯分类器

a. 高斯似然

# Naive Bayes classifier with a Gaussian likelihood.
from sklearn.naive_bayes import GaussianNB

SEED = 123456  # random seed for the models built later on
model_NB = GaussianNB()

# Time the fit only.
t = time()
model_NB.fit(xx_train, y_train)
_fit_seconds = round(time() - t, 4)
print('时间:', _fit_seconds)

# Per-class precision / recall / F1 on the training split, then the test split.
_train_report = classification_report(y_train, model_NB.predict(xx_train), digits=4)
print('GaussianNB模型结果评价-训练集:\n', _train_report)
_test_report = classification_report(y_test, model_NB.predict(xx_test), digits=4)
print('GaussianNB模型结果评价-测试集:\n', _test_report)

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
时间: 0.1187
GaussianNB模型结果评价-训练集:
               precision    recall  f1-score   support
           0     0.9379    0.4987    0.6511      6389
           1     0.4827    0.9341    0.6365      3200
    accuracy                         0.6440      9589
   
  • 2
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

远胥

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值