News text classification with a Naive Bayes classifier in sklearn

The required dataset can be downloaded from Sogou Labs:

http://www.sogou.com/labs/resource/ca.php
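
The preprocessing below assumes the Sogou corpus markup, in which each article's title and body sit on their own lines wrapped in XML-style tags. An illustrative (made-up) fragment of what the script expects news.txt to look like:

<contenttitle>…title of one article…</contenttitle>
<content>…body text of that article…</content>

Only these two tags are used by the regular expressions in pre_process_dataset; any other markup in the dump is ignored.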

import jieba
import pandas as pd
import re
import numpy as np
from wordcloud import WordCloud
from jieba import analyse
import warnings
warnings.filterwarnings("ignore")
from gensim import corpora, models, similarities
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

class NewClassify(object):

    def __init__(self):
        # Path to the raw dataset
        self.data_set_path = "./dataset/news.txt"
        # Stop word list (one entry per whitespace-separated token)
        self.cease_words_table = open("./dataset/cease_words_table.txt", "r", encoding="utf-8").read().split()


    # Data preprocessing
    def pre_process_dataset(self):
        with open(self.data_set_path, "r", encoding="utf-8") as fp:
            # Read the raw data
            news_txt = fp.read()
            # Extract the news bodies and titles
            news_contents = re.findall(r"<content>(.*)</content>", news_txt)
            # Segment each body into words
            news_contents = list(map(lambda s: jieba.lcut(s), news_contents))
            titles = re.findall(r"<contenttitle>(.*)</contenttitle>", news_txt)
            # Build a DataFrame of the segmented news bodies
            df_content = pd.DataFrame({"content_S": news_contents})
            # Remove stop words
            contents = df_content.content_S.values.tolist()
            # Holds the news bodies with stop words removed
            contents_clean = []
            # Holds every non-stop word that appears in the news
            all_words = []
            # Walk through each article and drop its stop words
            for line in contents:
                line_clean = []
                for word in line:
                    if word in self.cease_words_table:
                        continue
                    line_clean.append(word)
                    all_words.append(word)
                contents_clean.append(line_clean)
            # Return the cleaned articles and all remaining words
            return contents_clean, all_words


    # Count word frequencies
    def statistic_word_frequency(self, df_all_words):

        # Group by word, then count how many times each word occurs
        words_count = df_all_words.groupby(by=["all_words"])["all_words"].agg(count="size")
        # Re-index the result and sort it by count in descending order
        words_count = words_count.reset_index().sort_values(by=["count"], ascending=False)
        return words_count
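    # Hedged note (not in the original post): on pandas 2.x an equivalent table can be built
    # in one line, since Series.value_counts() already sorts the counts in descending order:
    #     words_count = df_all_words["all_words"].value_counts().reset_index()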

    # Draw a word cloud of the most frequent words
    # (minimal sketch: the font path below is an assumption; a Chinese font is needed to render Chinese text)
    def paint_word_cloud(self, words_count):
        word_freq = dict(zip(words_count["all_words"].head(100), words_count["count"].head(100)))
        cloud = WordCloud(font_path="./dataset/simhei.ttf", background_color="white", max_font_size=80)
        cloud.fit_words(word_freq).to_file("./word_cloud.png")


    # Extract keywords
    def extract_abstract_words(self, df_content):
        news = df_content["contents_clean"][10]
        news_str = "".join(news)
        print(news_str)
        abstract_words = analyse.extract_tags(news_str, topK=5, withWeight=False)
        print(abstract_words)

    # LDA topic model
    def LDA_model(self, contents_clean):
        # Build the dictionary, i.e. map every word to a unique integer id
        dictionary = corpora.Dictionary(contents_clean)
        # Convert each article into a bag-of-words vector using the dictionary;
        # each tuple is (word id in the dictionary, number of times that word appears in the document)
        corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
        # Build the LDA topic model
        lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
        # Print the 20 topics
        for topic in lda.print_topics(num_topics=20, num_words=5):
            print(topic)
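        # Hedged addition (not in the original post): LdaModel can also report the topic
        # mixture of a single document as (topic_id, probability) pairs, for example:
        if corpus:
            print(lda.get_document_topics(corpus[0]))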


    # Attach a category label to every article
    def lable_news_process(self, contents_clean):
        lables = []
        lable = ["汽车","财经","科技","健康","体育","教育","文化","军事","娱乐","时尚"]
        # Repeat the 10 categories 20 times; this assumes the corpus yields exactly 200 articles in this order
        for i in range(20):
            lables += lable
        df_train = pd.DataFrame({"contents_clean": contents_clean, "lable": lables})
        return df_train


    # Map the text labels to integer codes
    def lable_map(self, df_train):
        lable_map = {"汽车": 1,"财经": 2,"科技": 3,"健康": 4,"体育": 5,"教育": 6,"文化": 7,"军事": 8,"娱乐": 9,"时尚": 0}
        df_train["lable"] = df_train["lable"].map(lable_map)
        return df_train





if __name__ == '__main__':
    nc = NewClassify()
    # Get the preprocessed news articles and the full word list
    contents_clean, all_words = nc.pre_process_dataset()
    # Build a DataFrame of the cleaned articles
    df_content = pd.DataFrame({"contents_clean": contents_clean})
    # Build a DataFrame of all words
    df_all_words = pd.DataFrame({"all_words": all_words})
    # # Count word frequencies
    # words_count = nc.statistic_word_frequency(df_all_words)
    # # Extract keywords
    # nc.extract_abstract_words(df_content)
    # LDA topic modelling
    # nc.LDA_model(contents_clean)
    # Label the news data
    df_train = nc.lable_news_process(contents_clean)
    # Map the labels to integers
    df_train = nc.lable_map(df_train)
    # Classify the news with sklearn
    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(df_train["contents_clean"].values, df_train["lable"].values)
    # Build the bag-of-words vectors: first join each article back into a space-separated string
    words = []
    for line_index in range(len(x_train)):
        try:
            words.append(" ".join(x_train[line_index]))
        except:
            print(line_index)

    vec = CountVectorizer(analyzer="word", max_features=4000, lowercase=False)
    vec.fit(words)
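    # Hedged alternative (not in the original post): TF-IDF weighting is a common drop-in
    # replacement for raw counts here; with sklearn it would look roughly like this:
    #     from sklearn.feature_extraction.text import TfidfVectorizer
    #     vec = TfidfVectorizer(analyzer="word", max_features=4000, lowercase=False)
    #     vec.fit(words)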
    # Instantiate a multinomial Naive Bayes classifier
    classifier = MultinomialNB()
    # Train it on the vectorized training data
    classifier.fit(vec.transform(words), y_train)
    # Evaluate the model on the test set
    test_words = []
    for line_index in range(len(x_test)):
        try:
            test_words.append(" ".join(x_test[line_index]))
        except:
            print(line_index)
    # Print the accuracy on the test set
    print(classifier.score(vec.transform(test_words), y_test))
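
As a quick sanity check, the fitted vectorizer and classifier can also be applied to a single unseen article. The snippet below is a hedged sketch that is not part of the original script: the sample sentence is made up, and it simply continues the __main__ block above.

    # Hedged sketch: classify one made-up article with the model trained above
    new_article = "国足在昨晚的比赛中以2比0战胜对手"
    new_doc = " ".join(jieba.lcut(new_article))
    print(classifier.predict(vec.transform([new_doc]))[0])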