# 文本情感分类 — text sentiment classification (jieba segmentation + sklearn classifiers)

import csv
import time
import warnings
from functools import lru_cache

import jieba
import pandas as pd
warnings.filterwarnings('ignore')
@lru_cache(maxsize=1)
def stopwordslist():
    """Load the HIT stop-word list from 'HGD_StopWords.txt'.

    Returns a frozenset of stripped stop words so that membership tests
    (`word in stopwords`) are O(1) instead of O(n) on a list.

    Cached with lru_cache because depart_sentence() calls this once per
    comment line; without the cache the file would be re-read and re-parsed
    for every single comment.
    """
    with open('HGD_StopWords.txt', encoding='UTF-8') as f:
        return frozenset(line.strip() for line in f)
def depart_sentence(sentence, stopwords=None):
    """Segment one sentence with jieba and drop stop words.

    Parameters
    ----------
    sentence : str
        Raw comment text; leading/trailing whitespace is stripped first.
    stopwords : collection of str, optional
        Words to discard. Defaults to the file-backed list from
        stopwordslist(); callers processing many lines may load the list
        once and pass it in to avoid repeated lookups.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if stopwords is None:
        stopwords = stopwordslist()
    # Fold the literal tab into the exclusion set so the loop needs a
    # single membership test per token (original also dropped '\t').
    drop = set(stopwords)
    drop.add('\t')
    tokens = jieba.lcut(sentence.strip())
    return ' '.join(tok for tok in tokens if tok not in drop)
# Segment every raw comment and write the stop-word-filtered version,
# one comment per line, to after_stop_words.txt.
with open('ALL_Comment.txt', 'r', encoding='UTF-8') as infile, \
        open('after_stop_words.txt', 'w', encoding='UTF-8') as outfile:
    for raw_line in infile:
        outfile.write(depart_sentence(raw_line) + '\n')

print("删除停用词和分词成功!!!")
# Assemble the dataset: one row per comment, with its label.
data = pd.DataFrame()
with open('after_stop_words.txt', encoding='UTF-8') as f:
    data['评论'] = f.readlines()
with open('All_label.txt', encoding='UTF-8') as f:
    all_labels = []
    for line in f:
        line = line.strip()
        # Skip blank lines; splitting '' on ',' would inject an empty label.
        if not line:
            continue
        # strip() before split: without it the last label on each line keeps
        # its trailing '\n' (e.g. '1\n' vs '1') and becomes a spurious class.
        all_labels.extend(line.split(','))
    data['评分'] = all_labels
data.head(5)  # no visible effect outside a notebook; kept for parity

import numpy as np
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
# Hold out 10% of the rows for testing. astype('U') coerces every entry to a
# unicode string — presumably to guard against non-string values (e.g. NaN
# from blank lines); TODO confirm against the input files.
x_train,x_test,y_train,y_test = model_selection.train_test_split(data.评论.values.astype('U'),data.评分.values,test_size=0.1,random_state=1)
# TF-IDF features: drop terms appearing in >80% of documents or <3 documents.
TF_Vec = TfidfVectorizer(max_df=0.8,min_df=3)
x_train_tfvec = TF_Vec.fit_transform(x_train)
# transform (not fit_transform) on the test split — vocabulary comes from train only.
x_test_tfvec = TF_Vec.transform(x_test)

# Raw term-count features with the same document-frequency cutoffs.
CT_Vec = CountVectorizer(max_df=0.8,min_df=3)
x_train_ctvec = CT_Vec.fit_transform(x_train)
x_test_ctvec = CT_Vec.transform(x_test)
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# Logistic regression on the TF-IDF features, grid-searching C and penalty.
# solver='liblinear' is required: the default lbfgs solver does not support
# the 'l1' penalty, so those grid candidates would raise and abort the search.
lr = linear_model.LogisticRegression(solver='liblinear')
model = GridSearchCV(lr, cv=3, param_grid={'C': np.logspace(0, 4, 30), 'penalty': ['l1', 'l2']})
model.fit(x_train_tfvec, y_train)

# accuracy_score(y_true, y_pred) — conventional argument order; the metric
# itself is symmetric, so the values match the original script's.
y_train_predict = model.predict(x_train_tfvec)
train_accuracy = accuracy_score(y_train, y_train_predict)

y_test_predict = model.predict(x_test_tfvec)
test_accuracy = accuracy_score(y_test, y_test_predict)

print('最优参数:', model.best_params_)
print('使用TF-IDF提取特征使用逻辑回归,模型自适应参数,\n训练集:{0}\n测试集:{1}'.format(train_accuracy,test_accuracy))

# Logistic regression on the raw term-count (CountVectorizer) features.
# solver='liblinear' is required: the default lbfgs solver does not support
# the 'l1' penalty, so those grid candidates would raise and abort the search.
lr = linear_model.LogisticRegression(solver='liblinear')
model = GridSearchCV(lr, cv=3, param_grid={'C': np.logspace(0, 4, 30), 'penalty': ['l1', 'l2']})
model.fit(x_train_ctvec, y_train)

y_train_predict = model.predict(x_train_ctvec)
train_accuracy = accuracy_score(y_train, y_train_predict)

y_test_predict = model.predict(x_test_ctvec)
test_accuracy = accuracy_score(y_test, y_test_predict)

print('最优参数:',model.best_params_)
print('使用CountVectorizer提取特征使用逻辑回归,模型自适应参数\n训练集:{0}\n测试集:{1}'.format(train_accuracy,test_accuracy))

# k-nearest-neighbours baseline on the TF-IDF features (default k=5).
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(x_train_tfvec, y_train)
pred_train = model.predict(x_train_tfvec)
pred_test = model.predict(x_test_tfvec)
train_accuracy = accuracy_score(y_train, pred_train)
test_accuracy = accuracy_score(y_test, pred_test)

print('使用KNN\n训练集:{0}\n测试集:{1}'.format(train_accuracy,test_accuracy))

# Random-forest baseline on the TF-IDF features (default hyper-parameters;
# no random_state, so results vary run to run, as in the original).
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train_tfvec, y_train)
pred_train = model.predict(x_train_tfvec)
pred_test = model.predict(x_test_tfvec)
train_accuracy = accuracy_score(y_train, pred_train)
test_accuracy = accuracy_score(y_test, pred_test)

print('使用随机森林\n训练集:{0}\n测试集:{1}'.format(train_accuracy,test_accuracy))

 

# Multinomial naive Bayes baseline — a natural fit for sparse text features.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train_tfvec, y_train)
pred_train = model.predict(x_train_tfvec)
pred_test = model.predict(x_test_tfvec)
train_accuracy = accuracy_score(y_train, pred_train)
test_accuracy = accuracy_score(y_test, pred_test)

print('使用多项式贝叶斯:\n训练集:{0}\n测试集:{1}'.format(train_accuracy,test_accuracy))

# NOTE(review): the original paste ended with CSDN web-page boilerplate
# (comment counter, red-packet / payment-widget text) accidentally captured
# by the scrape. It was not Python and made the file unparseable, so it has
# been removed.