Implementing News Text Classification in Python with Random Forest, Logistic Regression, and Naive Bayes

The text data used in this article can be downloaded from THUCTC, or you can generate your own with a crawler.

This article is based mainly on: https://blog.csdn.net/hao5335156/article/details/82716923

nb stands for naive Bayes

rf stands for random forest

lg stands for logistic regression

Working through this program helped me, as a beginner, consolidate Python fundamentals and learn how to process text in Python and how to call the classifiers, which makes the machine learning that follows easier to approach.

The intuitive meaning of each classifier parameter:

[Figure: overview of the classifier parameters (original image not reproduced)]

# -*- coding: utf-8 -*-
"""
Created on Thu Nov 29 13:00:46 2018

@author: caoqu
"""
import matplotlib.pyplot as plt
import random
import os
import jieba
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF

# Text processing --> build the training set, test set, and word-frequency list
def text_processor(text_path, test_size=0.2):
    folder_list = os.listdir(text_path)  # one sub-folder per category
    data_list = []   # each element is one article, as a list of words
    class_list = []  # the category of each article

    # Each outer iteration reads one category folder
    for folder in folder_list:
        new_folder_path = os.path.join(text_path, folder)
        # THUCTC is huge, so 200 texts are sampled at random from each
        # category for training and testing; adjust as you like
        files = random.sample(os.listdir(new_folder_path), 200)
        # Each inner iteration reads one article
        for file in files:
            with open(os.path.join(new_folder_path, file), 'r', encoding='UTF-8') as fp:
                raw = fp.read()
            word_cut = jieba.cut(raw, cut_all=False)  # precise-mode segmentation
            word_list = list(word_cut)  # one word_list per article
            data_list.append(word_list)
            class_list.append(folder)  # the folder name is the class label (no encode() needed in Python 3)

    # Split into training and test sets
    # data_class_list = [[word_list_one[], sports], [word_list_two[], finance], ...]
    data_class_list = list(zip(data_list, class_list))
    random.shuffle(data_class_list)  # shuffle the order
    index = int(len(data_class_list) * test_size) + 1  # train:test is roughly 8:2
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)  # (word_list_one[], ...), (sports, ...)
    test_data_list, test_class_list = zip(*test_list)

    # Count word frequencies: all_words_dict = {"key_word_one": 100, "key_word_two": 200, ...}
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # Sort by frequency in descending order
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = list(list(zip(*all_words_tuple_list))[0])  # all_words_list = [word_one, word_two, ...]
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
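For reference, here is a minimal sketch of what the precise-mode cut above produces; the sample sentence and the segmentation shown are illustrative, and the exact output depends on jieba's dictionary version:

import jieba

raw = '北京时间今晚八点将举行足球比赛'  # a made-up one-sentence "article"
word_list = list(jieba.cut(raw, cut_all=False))  # precise mode: non-overlapping words
print(word_list)
# e.g. ['北京', '时间', '今晚', '八点', '将', '举行', '足球比赛']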

# Select the feature words
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:  # cap the feature dimension at 1000
            break
        # keep tokens that are not digits, not stopwords, and 2-4 characters long
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words
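A toy run of words_dict (vocabulary invented for illustration) shows how deleteN skips the most frequent tokens and how the digit, stopword, and length filters apply:

# Frequency-sorted vocabulary: '的' and '了' are the two most frequent tokens
all_words = ['的', '了', '比赛', '球队', '经济', '123', '股市']
# deleteN=2 skips '的' and '了'; '123' is then dropped by isdigit()
print(words_dict(all_words, deleteN=2, stopwords_set={'的', '了'}))
# -> ['比赛', '球队', '经济', '股市']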

# Turn articles into feature vectors
def text_features(train_data_list, test_data_list, feature_words):
    def text_feature_(text, feature_words):
        text_words = set(text)
        # binary bag-of-words: 1 if the feature word occurs in the article, else 0
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_feature_(text, feature_words) for text in train_data_list]
    test_feature_list = [text_feature_(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list
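The representation is therefore a fixed-length 0/1 vector per article, one slot per feature word; a toy example (document invented for illustration):

feature_words = ['比赛', '股市', '电影']
doc = ['今晚', '的', '比赛', '很', '精彩']  # one segmented article
train_feats, test_feats = text_features([doc], [doc], feature_words)
print(train_feats)  # [[1, 0, 0]] -- only '比赛' occurs in the article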

# Load the stopword file into a deduplicated set
def make_word_set(words_file):
    words_set = set()
    with open(words_file, 'r', encoding='UTF-8') as fp:
        for line in fp:
            word = line.strip()
            if len(word) > 0:
                words_set.add(word)  # a set deduplicates by itself
    return words_set
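The stopword file is assumed to be plain text with one stopword per line; a quick sanity check of the loader (the file contents sketched in the comment are illustrative):

# stop_words_cn.txt is assumed to contain one stopword per line, e.g.
#     的
#     了
#     是
stopwords_set = make_word_set('D:/WorkSpace/tmp/py/stop_words_cn.txt')
print(len(stopwords_set), '的' in stopwords_set)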

# Mean of a list, rounded to three decimals
def average(accuracy_list):
    return round(sum(accuracy_list) / len(accuracy_list), 3)

# Fit the selected classifier and return its accuracy on the test set
def text_classifier(train_feature_list, test_feature_list, train_class_list, test_class_list, flag):
    if flag == 'nb':
        # Naive Bayes: Laplace smoothing by default, class priors left unspecified
        classifier = NB().fit(train_feature_list, train_class_list)
    elif flag == 'lg':
        # Logistic regression: liblinear solver, iteration cap, automatic multi-class strategy
        classifier = LR(solver='liblinear', max_iter=5000, multi_class='auto').fit(train_feature_list, train_class_list)
    elif flag == 'rf':
        # Random forest with 200 trees
        classifier = RF(n_estimators=200).fit(train_feature_list, train_class_list)
    else:
        raise ValueError('unknown flag: ' + flag)
    test_accuracy = classifier.score(test_feature_list, test_class_list)  # accuracy on the test set
    return test_accuracy
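Since the parameter-overview image above is not reproduced, here is a self-contained sketch of the same three classifier constructions on toy data; the feature matrix and labels are invented for illustration:

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

X = [[1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]]  # 4 articles x 3 feature words
y = ['sports', 'finance', 'sports', 'finance']

for clf in (MultinomialNB(alpha=1.0),  # alpha=1.0 is Laplace smoothing, the default
            LogisticRegression(solver='liblinear', max_iter=5000),
            RandomForestClassifier(n_estimators=200)):
    print(type(clf).__name__, clf.fit(X, y).score(X, y))  # accuracy on the same toy data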

def start(flag):
    folder_path = 'D:/WorkSpace/THUCTC/THUCNews/'  # change to your own path
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = text_processor(folder_path, test_size=0.2)
    stopwords_set = make_word_set('D:/WorkSpace/tmp/py/stop_words_cn.txt')

    # Feature extraction and classification
    deleteNs = range(0, 1000, 20)
    test_accuracy_list = []
    # Each iteration removes 20 more of the highest-frequency words,
    # until 980 of them have been removed
    for deleteN in deleteNs:
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = text_features(train_data_list, test_data_list, feature_words)
        test_accuracy = text_classifier(train_feature_list, test_feature_list, train_class_list, test_class_list, flag)
        test_accuracy_list.append(test_accuracy)
    print(flag + ' mean accuracy:', average(test_accuracy_list))
    print(flag + ' max accuracy:', round(max(test_accuracy_list), 3))
    return deleteNs, test_accuracy_list
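start can also be called on its own to evaluate a single classifier across every deleteN setting, without the plotting loop below; a minimal sketch:

# Evaluate only naive Bayes; prints its mean/max accuracy and returns the curve
deleteNs, accuracy_list = start('nb')
print(list(deleteNs)[:3], accuracy_list[:3])  # first three points of the curve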

if __name__ == "__main__":
    plt.figure(figsize=(13, 11))
    # Run the whole experiment 5 times; each run resamples 200 texts per
    # category, so each classifier contributes 5 accuracy curves
    for i in range(5):
        nb_deleteNs, nb_accuracy_list = start('nb')
        lg_deleteNs, lg_accuracy_list = start('lg')
        rf_deleteNs, rf_accuracy_list = start('rf')

        # Plot accuracy against deleteN
        plt.plot(nb_deleteNs, nb_accuracy_list, 'b', label='nb')
        plt.plot(lg_deleteNs, lg_accuracy_list, 'k', label='lg')
        plt.plot(rf_deleteNs, rf_accuracy_list, 'r', label='rf')
        # Mark the maximum of each curve
        plt.annotate('max', xy=(nb_accuracy_list.index(max(nb_accuracy_list)) * 20, max(nb_accuracy_list)))
        plt.annotate('max', xy=(lg_accuracy_list.index(max(lg_accuracy_list)) * 20, max(lg_accuracy_list)))
        plt.annotate('max', xy=(rf_accuracy_list.index(max(rf_accuracy_list)) * 20, max(rf_accuracy_list)))

    plt.title('Relationship of deleteNs and test_accuracy')
    plt.xlabel('deleteNs')
    plt.ylabel('test_accuracy')
    plt.grid()
    plt.legend()
    plt.show()

Sample run results:

[Figure: test accuracy vs. deleteN for the three classifiers (original image not reproduced)]

Feel free to adjust the other parameters yourself.
