Naive Bayes Classification
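
The pipeline below segments a labelled Chinese corpus with jieba, converts it into TF-IDF features, and trains scikit-learn's MultinomialNB. For reference, multinomial Naive Bayes predicts the class

    c* = argmax_c [ log P(c) + Σ_i f_i · log P(w_i | c) ]

where f_i is the weight of term w_i in the document (here a TF-IDF value), and the alpha parameter passed to MultinomialNB is the additive smoothing term used when estimating P(w_i | c).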

train.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import shutil
import jieba
import pickle
from sklearn.utils import Bunch  # Bunch moved out of sklearn.datasets.base in newer scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import savefile, readfile, readbunchobj, writebunchobj
from sklearn.naive_bayes import MultinomialNB  # multinomial Naive Bayes
from sklearn import metrics
import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package


# Chinese word segmentation
def corpus_segment(corpus_path, seg_path):      # corpus_path: raw (unsegmented) corpus dir; seg_path: output dir for the segmented corpus
    catelist = os.listdir(corpus_path)  # the first-level directory names under the corpus are the class labels
    print("Segmenting...")
    if os.path.exists(seg_path):
        shutil.rmtree(seg_path)
    for mydir in catelist:      # iterate over the class directories; mydir is the class name
        class_path = corpus_path + mydir + "/"      # class subdirectory, e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"        # matching output directory, e.g. train_corpus_seg/art/
        if not os.path.exists(seg_dir):     # (re)create the output directory for this class
            os.makedirs(seg_dir)
        else:
            shutil.rmtree(seg_dir)
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)      # all text files (xxx.txt) of this class in the raw corpus
        for file_path in file_list:  # iterate over the files of this class
            fullname = class_path + file_path  # full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # raw bytes of the original text, including stray spaces, blank lines, line breaks, etc.
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # drop line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # drop extra spaces
            content_seg = jieba.cut(content)  # segment the file content with jieba
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # save the segmented text to the output corpus
    print("Chinese corpus segmentation finished!")


# Build a Bunch object from the segmented corpus
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)     # subdirectories of the segmented corpus, i.e. the class names
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])      # create a Bunch instance
    bunch.target_name.extend(catelist)      # extend() appends every item of catelist to the target_name list
    for mydir in catelist:          # iterate over the class directories
        class_path = seg_path + mydir + "/"     # path of this class
        file_list = os.listdir(class_path)      # txt file names in this class
        for file_path in file_list:     # iterate over the txt files of this class
            fullname = class_path + file_path       # full path of the file
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))       # read the file content
    with open(wordbag_path, "wb") as file_obj:          # pickle the bunch to wordbag_path
        pickle.dump(bunch, file_obj)
    print("Text object (Bunch) built!")


# Build the TF-IDF vector space
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = readfile(stopword_path).decode('utf-8').splitlines()  # decode so the stop words match the str tokens
    bunch = readbunchobj(bunch_path)        # load the segmented Bunch
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[],
                       vocabulary={})       # target Bunch for the TF-IDF data: vocabulary is the term index, tdm holds the TF-IDF matrix
    if train_tfidf_path is not None:
        trainbunch = readbunchobj(train_tfidf_path)     # when vectorizing the test set, load the trained TF-IDF space
        tfidfspace.vocabulary = trainbunch.vocabulary       # the test Bunch reuses the training vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,       # build a vectorizer
                                     vocabulary=trainbunch.vocabulary)      # reuse the training vocabulary so test features line up with the model
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)       # compute TF-IDF and store it in the target Bunch
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True)       # build a vectorizer
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)       # compute TF-IDF and store it in the target Bunch
        tfidfspace.vocabulary = vectorizer.vocabulary_      # keep the fitted vocabulary in the Bunch
    writebunchobj(space_path, tfidfspace)       # save the TF-IDF space to space_path
    print("TF-IDF vector space created!")


if __name__ == "__main__":
    train_corpus_path = "train_corpus/"  # raw (unsegmented) training corpus
    # test_corpus_path = "test_corpus/"  # raw (unsegmented) test corpus
    stopword_path = "stopword.txt"  # stop word list
    train_seg_path = "train_corpus_seg/"  # segmented training corpus
    train_bunch_path = "train_word_bag/train_set.dat"  # where the training Bunch is stored
    train_space_path = "train_word_bag/tfdifspace.dat"

    if not os.path.exists('clf_model_chinese.m'):
        # segment the training corpus
        corpus_segment(train_corpus_path, train_seg_path)

        # build the training Bunch
        if not os.path.exists('train_word_bag/'):
            os.makedirs('train_word_bag/')
        corpus2Bunch(train_bunch_path, train_seg_path)

        # build the training TF-IDF vector space
        vector_space(stopword_path, train_bunch_path, train_space_path)

        '''# segment the test corpus
        test_seg_path = "test_corpus_seg/"  # segmented test corpus
        corpus_segment(test_corpus_path, test_seg_path)

        # build the test Bunch
        test_bunch_path = "test_word_bag/test_set.dat"  # where the test Bunch is stored
        if not os.path.exists('test_word_bag/'):
            os.makedirs('test_word_bag/')
        corpus2Bunch(test_bunch_path, test_seg_path)

        # build the test TF-IDF vector space, reusing the training vocabulary
        test_space_path = "test_word_bag/tfdifspace.dat"
        vector_space(stopword_path, test_bunch_path, test_space_path, train_space_path)'''

        # load the training TF-IDF space
        train_set = readbunchobj(train_space_path)

        # train the classifier on the TF-IDF matrix and the class labels;
        # alpha is the additive (Laplace/Lidstone) smoothing parameter: smaller values mean less smoothing
        clf = MultinomialNB(alpha=0.0001).fit(train_set.tdm, train_set.label)
        joblib.dump(clf, "clf_model_chinese.m")
    else:
        print('Model already trained')
    '''
    # load the test set
    testpath = "test_word_bag/tfdifspace.dat"
    test_set = readbunchobj(testpath)

    # load the trained model
    clf = joblib.load('clf_model_chinese.m')

    # predict the class of each test document
    predicted = clf.predict(test_set.tdm)

    # print the predicted class of each file
    for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
        print(file_name, " --> predicted class:", expct_cate)
    print("Prediction finished!")

    # classification metrics
    def metrics_result(actual, predict):
        print('precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted')))
        print('recall:    {0:.3f}'.format(metrics.recall_score(actual, predict, average='weighted')))
        print('f1-score:  {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted')))
    metrics_result(test_set.label, predicted)'''
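
train.py expects each class to be a subdirectory of train_corpus/ holding one .txt file per document (e.g. train_corpus/art/21.txt). A minimal sanity check after training, assuming the paths used above, is to reload the saved model and TF-IDF space and score the training data:

# sanity check -- a minimal sketch, assuming train.py has already produced
# clf_model_chinese.m and train_word_bag/tfdifspace.dat in the working directory
import joblib
from Tools import readbunchobj

clf = joblib.load("clf_model_chinese.m")                    # trained MultinomialNB
train_set = readbunchobj("train_word_bag/tfdifspace.dat")   # TF-IDF space built by vector_space()
print("training accuracy:", clf.score(train_set.tdm, train_set.label))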

Tools.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pickle


# Save content (bytes) to a file
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)


# Read a file as raw bytes
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


# Write a Bunch object to disk
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


# Read a Bunch object from disk
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch
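
These helpers just read and write raw bytes and pickled Bunch objects. A quick round-trip check, assuming scikit-learn is installed for the Bunch class (the class names here are placeholders):

from sklearn.utils import Bunch
from Tools import writebunchobj, readbunchobj

b = Bunch(target_name=["art", "sports"], label=[], filenames=[], contents=[])
writebunchobj("demo.dat", b)                    # pickle the Bunch to demo.dat
print(readbunchobj("demo.dat").target_name)     # ['art', 'sports']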

classification.py

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import readfile, readbunchobj
import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package

# Load a trained classifier from disk
def Model(modelpath):
    clf = joblib.load(modelpath)
    return clf

def ClassificationCN(text, stoppath="stopword.txt", modelpath='clf_model_chinese.m', vocabularypath="train_word_bag/tfdifspace.dat"):
    # load the trained model
    clf = Model(modelpath)
    # clean and segment the input text
    text = text.encode('utf-8')
    text = text.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # drop line breaks
    text = text.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # drop extra spaces
    text_seg = jieba.cut(text)  # segment the text; the result is a generator
    text = [' '.join(text_seg)]  # join the tokens into one space-separated string and wrap it in a list
    trainbunch = readbunchobj(vocabularypath)  # load the training TF-IDF space to reuse its vocabulary
    stpwrdlst = readfile(stoppath).decode('utf-8').splitlines()  # decode so the stop words match the str tokens
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, vocabulary=trainbunch.vocabulary)
    tfidf = vectorizer.fit_transform(text)  # compute the TF-IDF vector of the input text
    predicted = clf.predict(tfidf)
    return predicted[0]
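
A minimal usage sketch, assuming clf_model_chinese.m, stopword.txt and train_word_bag/tfdifspace.dat (all produced or used by train.py) sit in the working directory; the returned label is simply one of the training corpus directory names:

from classification import ClassificationCN

label = ClassificationCN("这是一段需要分类的中文新闻文本。")  # any Chinese text to classify
print("predicted class:", label)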