《机器学习实战》个人学习记录笔记(九)———朴素贝叶斯之新浪新闻分类(Sklearn)

第四章 朴素贝叶斯

PS:个人笔记 根据《机器学习实战》这本书,Jack-Cui的博客,以及深度眸的视频进行学习

1 中文语句切分

import os
import jieba

def TextProcessing(folder_path):
    """Tokenize the text files of every class folder, print and return them.

    Each sub-directory of ``folder_path`` is treated as one class and its name
    is used as the label of the files it contains. At most 100 files are read
    per class; every file is decoded as UTF-8 and segmented with jieba.

    Parameters:
        folder_path - directory holding one sub-directory of text files per class
    Returns:
        data_list - list of token lists, one entry per file read
        class_list - list of labels (sub-directory names), parallel to data_list
    """
    data_list = []                                                # tokenized documents
    class_list = []                                               # label of each document

    for folder in os.listdir(folder_path):
        new_folder_path = os.path.join(folder_path, folder)
        if not os.path.isdir(new_folder_path):
            # A stray file next to the class folders is not a class; listing it would raise.
            continue
        for file in os.listdir(new_folder_path)[:100]:            # at most 100 samples per class
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:
                raw = f.read()
            # cut_all = False asks jieba for its accurate (non-full) mode; cut() yields
            # tokens lazily, so materialize them into a list.
            data_list.append(list(jieba.cut(raw, cut_all = False)))
            class_list.append(folder)

    # Print once after every class has been loaded, instead of dumping the whole
    # accumulated list again after each folder.
    print(data_list)
    print(class_list)
    return data_list, class_list
if __name__ == '__main__':
    # Run the tokenizer over the sample corpus (one sub-folder per news category).
    TextProcessing('./SogouC/Sample')

2 文本特征选择

我们将所有文本分成训练集和测试集,并对训练集中的所有单词进行词频统计,并按降序排序。也就是将出现次数多的词语在前,出现次数少的词语在后进行排序。编写代码如下:

import os
import random
import jieba

"""
函数说明:中文文本处理
Parameters:
    folder_path - 文本存放的路径
    test_size - 测试集占比,默认占所有数据集的百分之20
Returns:
    all_words_list - 按词频降序排序的训练集列表
    train_data_list - 训练集列表
    test_data_list - 测试集列表
    train_class_list - 训练集标签列表
    test_class_list - 测试集标签列表
"""
def TextProcessing(folder_path, test_size = 0.2):
    """Load and tokenize the corpus, split it, and rank training words by frequency.

    Each sub-directory of ``folder_path`` is one class (its name is the label).
    At most 100 files are read per class; every file is decoded as UTF-8 and
    segmented with jieba. The (tokens, label) pairs are shuffled and the first
    ``int(n * test_size) + 1`` of them become the test split.

    Parameters:
        folder_path - directory holding one sub-directory of text files per class
        test_size - fraction of the samples held out for testing (default 0.2)
    Returns:
        all_words_list - training-set words sorted by descending frequency
        train_data_list - tuple of token lists used for training
        test_data_list - tuple of token lists used for testing
        train_class_list - tuple of training labels
        test_class_list - tuple of test labels
    """
    data_list = []
    class_list = []

    for folder in os.listdir(folder_path):
        new_folder_path = os.path.join(folder_path, folder)
        if not os.path.isdir(new_folder_path):
            # A stray file next to the class folders is not a class; listing it would raise.
            continue
        for file in os.listdir(new_folder_path)[:100]:            # at most 100 samples per class
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:
                raw = f.read()
            # cut_all = False asks jieba for its accurate (non-full) mode.
            data_list.append(list(jieba.cut(raw, cut_all = False)))
            class_list.append(folder)

    data_class_list = list(zip(data_list, class_list))            # keep each document with its label
    random.shuffle(data_class_list)                               # files were read class by class
    index = int(len(data_class_list) * test_size) + 1             # boundary between test and train
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    # zip(*pairs) produces nothing for an empty split, which would break the
    # two-name unpacking; fall back to a pair of empty tuples instead.
    train_data_list, train_class_list = zip(*train_list) if train_list else ((), ())
    test_data_list, test_class_list = zip(*test_list) if test_list else ((), ())

    all_words_dict = {}                                           # word -> occurrences in the training set
    for word_list in train_data_list:
        for word in word_list:
            all_words_dict[word] = all_words_dict.get(word, 0) + 1

    # Stable sort by count, most frequent first.
    all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True)
    all_words_list = [word for word, _ in all_words_tuple_list]   # also fine when the vocabulary is empty
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

if __name__ == '__main__':
    # Root folder holding one sub-folder of text samples per news category.
    folder_path = './SogouC/Sample'
    (all_words_list,
     train_data_list, test_data_list,
     train_class_list, test_class_list) = TextProcessing(folder_path, test_size=0.2)
    # Show the training vocabulary, most frequent words first.
    print(all_words_list)

首先去掉高频词,至于去掉多少个高频词,我们可以通过观察去掉高频词个数和最终检测准确率的关系来确定。除此之外,去除数字,不把数字作为分类特征。同时,去除一些特定的词语,比如“的”、“一”、“在”、“不”、“当然”、“怎么”这类对新闻分类无影响的介词、代词、连词。

import os
import random
import jieba

def TextProcessing(folder_path, test_size = 0.2):
    """Load and tokenize the corpus, split it, and rank training words by frequency.

    Each sub-directory of ``folder_path`` is one class (its name is the label).
    At most 100 files are read per class; every file is decoded as UTF-8 and
    segmented with jieba. The (tokens, label) pairs are shuffled and the first
    ``int(n * test_size) + 1`` of them become the test split.

    Parameters:
        folder_path - directory holding one sub-directory of text files per class
        test_size - fraction of the samples held out for testing (default 0.2)
    Returns:
        all_words_list - training-set words sorted by descending frequency
        train_data_list - tuple of token lists used for training
        test_data_list - tuple of token lists used for testing
        train_class_list - tuple of training labels
        test_class_list - tuple of test labels
    """
    data_list = []
    class_list = []

    # Walk every class folder.
    for folder in os.listdir(folder_path):
        new_folder_path = os.path.join(folder_path, folder)
        if not os.path.isdir(new_folder_path):
            # A stray file next to the class folders is not a class; listing it would raise.
            continue
        # Read at most 100 text files of this class.
        for file in os.listdir(new_folder_path)[:100]:
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:
                raw = f.read()
            # cut_all = False asks jieba for its accurate (non-full) mode.
            data_list.append(list(jieba.cut(raw, cut_all = False)))
            class_list.append(folder)

    data_class_list = list(zip(data_list, class_list))            # keep each document with its label
    random.shuffle(data_class_list)                               # files were read class by class
    index = int(len(data_class_list) * test_size) + 1             # boundary between test and train
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    # zip(*pairs) produces nothing for an empty split, which would break the
    # two-name unpacking; fall back to a pair of empty tuples instead.
    train_data_list, train_class_list = zip(*train_list) if train_list else ((), ())
    test_data_list, test_class_list = zip(*test_list) if test_list else ((), ())

    all_words_dict = {}                                           # word -> occurrences in the training set
    for word_list in train_data_list:
        for word in word_list:
            all_words_dict[word] = all_words_dict.get(word, 0) + 1
    # Stable sort by count, most frequent first.
    all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True)
    all_words_list = [word for word, _ in all_words_tuple_list]   # also fine when the vocabulary is empty
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

"""
函数说明:读取文件里的内容,并去重,去停词表

Parameters:
    words_file - 文件路径
Returns:
    words_set - 读取的内容的set集合
"""
def MakeWordsSet(words_file):
    """Return the distinct non-blank lines of a word-list file.

    Parameters:
        words_file - path of a UTF-8 text file with one entry per line
    Returns:
        words_set - set of the lines with surrounding whitespace stripped;
                    lines that are empty after stripping are left out
    """
    with open(words_file, 'r', encoding = 'utf-8') as handle:
        stripped_entries = [entry.strip() for entry in handle.readlines()]
    # The set removes duplicates; the filter drops blank lines.
    return {entry for entry in stripped_entries if entry}

"""
函数说明:文本特征选取
Parameters:
    all_words_list - 训练集所有文本列表
    deleteN - 删除词频最高的deleteN个词
    stopwords_set - 指定的结束语
Returns:
    feature_words - 特征集
"""
def words_dict(all_words_list, deleteN, stopwords_set = set()):
    """Select feature words from the frequency-ranked vocabulary.

    The entries before index deleteN are skipped. Of the entries that follow,
    at most 1000 are examined, so the result holds at most 1000 words and
    fewer whenever some candidates are rejected.

    Parameters:
        all_words_list - vocabulary list, indexed by position
        deleteN - index of the first candidate, i.e. how many leading words to drop
        stopwords_set - words that must never become features
    Returns:
        feature_words - the accepted words, in vocabulary order
    """
    # Slicing the range caps the number of positions examined at 1000.
    examined_positions = range(deleteN, len(all_words_list))[:1000]
    feature_words = []
    for position in examined_positions:
        candidate = all_words_list[position]
        if candidate.isdigit():                 # pure numbers are not features
            continue
        if candidate in stopwords_set:          # explicitly excluded words
            continue
        if not 1 < len(candidate) < 5:          # keep only words of 2 to 4 characters
            continue
        feature_words.append(candidate)
    return feature_words

if __name__ == '__main__':
    folder_path = './SogouC/Sample'
    stopwords_file = './stopwords_cn.txt'
    (all_words_list,
     train_data_list, test_data_list,
     train_class_list, test_class_list) = TextProcessing(folder_path, test_size=0.2)
    # Drop the 100 most frequent words, then filter the candidates against the stop-word list.
    print(words_dict(all_words_list, 100, MakeWordsSet(stopwords_file)))

这个feature_words就是我们最终选出的用于新闻分类的特征。随后,我们就可以根据feature_words,将文本向量化,然后用于训练朴素贝叶斯分类器

3 使用Sklearn构建朴素贝叶斯分类器

sklearn.naive_bayes中的MultinomialNB

新闻分类属于多分类问题,我们可以使用MultinomialNB()来完成新闻分类。

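MultinomialNB假设特征的先验概率为多项式分布,即如下式:

$$P(X_j = x_{jl} \mid Y = C_k) = \frac{m_{kjl} + \lambda}{m_k + S_j\lambda}$$

其中,P(Xj = xjl | Y = Ck)是第k个类别的第j维特征的第l个取值条件概率。mk是训练集中输出为第k类的样本个数,mkjl是这些样本中第j维特征取第l个值的样本个数,Sj是第j维特征可能取值的个数。λ为一个大于0的常数,常常取值为1,即拉普拉斯平滑,也可以取其他值。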


参数说明如下:

  • alpha:浮点型可选参数,默认为1.0,其实就是添加拉普拉斯平滑,即为上述公式中的λ ,如果这个参数设置为0,就是不添加平滑;
  • fit_prior:布尔型可选参数,默认为True。布尔参数fit_prior表示是否要考虑先验概率,如果是false,则所有的样本类别输出都有相同的类别先验概率。否则可以自己用第三个参数class_prior输入先验概率,或者不输入第三个参数class_prior让MultinomialNB自己从训练集样本来计算先验概率,此时的先验概率为P(Y=Ck)=mk/m。其中m为训练集样本总数量,mk为输出为第k类别的训练集样本数。
  • class_prior:可选参数,默认为None。
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import os
import random
import jieba

def TextProcessing(folder_path, test_size = 0.2):
    """Load and tokenize the corpus, split it, and rank training words by frequency.

    Each sub-directory of ``folder_path`` is one class (its name is the label).
    At most 100 files are read per class; every file is decoded as UTF-8 and
    segmented with jieba. The (tokens, label) pairs are shuffled and the first
    ``int(n * test_size) + 1`` of them become the test split.

    Parameters:
        folder_path - directory holding one sub-directory of text files per class
        test_size - fraction of the samples held out for testing (default 0.2)
    Returns:
        all_words_list - training-set words sorted by descending frequency
        train_data_list - tuple of token lists used for training
        test_data_list - tuple of token lists used for testing
        train_class_list - tuple of training labels
        test_class_list - tuple of test labels
    """
    data_list = []
    class_list = []
    for folder in os.listdir(folder_path):
        new_folder_path = os.path.join(folder_path, folder)
        if not os.path.isdir(new_folder_path):
            # A stray file next to the class folders is not a class; listing it would raise.
            continue
        for file in os.listdir(new_folder_path)[:100]:            # at most 100 samples per class
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:
                raw = f.read()
            # cut_all = False asks jieba for its accurate (non-full) mode.
            data_list.append(list(jieba.cut(raw, cut_all = False)))
            class_list.append(folder)
    data_class_list = list(zip(data_list, class_list))            # keep each document with its label
    random.shuffle(data_class_list)                               # files were read class by class
    index = int(len(data_class_list) * test_size) + 1             # boundary between test and train
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    # zip(*pairs) produces nothing for an empty split, which would break the
    # two-name unpacking; fall back to a pair of empty tuples instead.
    train_data_list, train_class_list = zip(*train_list) if train_list else ((), ())
    test_data_list, test_class_list = zip(*test_list) if test_list else ((), ())
    all_words_dict = {}                                           # word -> occurrences in the training set
    for word_list in train_data_list:
        for word in word_list:
            all_words_dict[word] = all_words_dict.get(word, 0) + 1
    # Stable sort by count, most frequent first.
    all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True)
    all_words_list = [word for word, _ in all_words_tuple_list]   # also fine when the vocabulary is empty
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

def MakeWordsSet(words_file):
    """Collect the unique, non-blank lines of a UTF-8 text file into a set.

    Parameters:
        words_file - path of the file to read, one entry per line
    Returns:
        words_set - set of lines with surrounding whitespace removed
    """
    with open(words_file, 'r', encoding = 'utf-8') as source:
        trimmed = map(str.strip, source.readlines())
        # filter(None, ...) drops the lines that are empty once stripped.
        words_set = set(filter(None, trimmed))
    return words_set

"""
函数说明:根据feature_words将文本向量化

Parameters:
    train_data_list - 训练集
    test_data_list - 测试集
    feature_words - 特征集
Returns:
    train_feature_list - 训练集向量化列表
    test_feature_list - 测试集向量化列表
"""
def TextFeatures(train_data_list, test_data_list, feature_words):
    """Turn tokenized documents into 0/1 presence vectors over feature_words.

    Parameters:
        train_data_list - iterable of token lists (training documents)
        test_data_list - iterable of token lists (test documents)
        feature_words - ordered feature vocabulary; defines the vector columns
    Returns:
        train_feature_list - one list of 0/1 flags per training document
        test_feature_list - one list of 0/1 flags per test document
    """
    def vectorize(documents):
        rows = []
        for tokens in documents:
            present = set(tokens)               # set membership instead of scanning the token list
            rows.append([int(term in present) for term in feature_words])
        return rows

    train_feature_list = vectorize(train_data_list)
    test_feature_list = vectorize(test_data_list)
    return train_feature_list, test_feature_list

def words_dict(all_words_list, deleteN, stopwords_set = set()):
    """Choose feature words from a vocabulary ranked by descending frequency.

    Candidates start at index deleteN and at most 1000 of them are examined;
    a candidate is kept when it is not made of digits only, is not a stop
    word, and is 2 to 4 characters long.

    Parameters:
        all_words_list - vocabulary list, indexed by position
        deleteN - number of leading (most frequent) words to skip
        stopwords_set - words excluded from the features
    Returns:
        feature_words - list of the accepted words, in vocabulary order
    """
    def qualifies(term):
        rejected = (term.isdigit()
                    or term in stopwords_set
                    or len(term) <= 1
                    or len(term) >= 5)
        return not rejected

    # Slicing the range limits the examined window to 1000 positions.
    window = range(deleteN, len(all_words_list))[:1000]
    return [all_words_list[i] for i in window if qualifies(all_words_list[i])]

"""
函数说明:新闻分类器

Parameters:
    train_feature_list - 训练集向量化的特征文本
    test_feature_list - 测试集向量化的特征文本
    train_class_list - 训练集分类标签
    test_class_list - 测试集分类标签
Returns:
    test_accuracy - 分类器精度
"""
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    """Fit a MultinomialNB model on the training vectors and score it on the test vectors.

    Parameters:
        train_feature_list - vectorized training documents
        test_feature_list - vectorized test documents
        train_class_list - labels of the training documents
        test_class_list - labels of the test documents
    Returns:
        test_accuracy - value returned by the classifier's score() on the test split
    """
    model = MultinomialNB()                     # constructed with its default parameters
    model.fit(train_feature_list, train_class_list)
    return model.score(test_feature_list, test_class_list)

if __name__ == '__main__':
    folder_path = './SogouC/Sample'
    (all_words_list,
     train_data_list, test_data_list,
     train_class_list, test_class_list) = TextProcessing(folder_path, test_size=0.2)
    stopwords_set = MakeWordsSet('./stopwords_cn.txt')

    def accuracy_without_top(deleteN):
        # Rebuild the features with the deleteN most frequent words dropped, then train and score.
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
        return TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)

    # How many high-frequency words to drop is found empirically: try 0, 20, ..., 980
    # and plot the resulting test accuracy.
    deleteNs = range(0, 1000, 20)
    test_accuracy_list = [accuracy_without_top(deleteN) for deleteN in deleteNs]

    plt.figure()
    plt.plot(deleteNs, test_accuracy_list)
    plt.title('Relationship of deleteNs and test_accuracy')
    plt.xlabel('deleteNs')
    plt.ylabel('test_accuracy')
    plt.show()

绘制出了deleteNs和test_accuracy的关系,这样我们就可以大致确定去掉前多少的高频词汇了。每次运行程序,绘制的图形可能不尽相同,我们可以通过多次测试,来决定这个deleteN的取值,然后确定这个参数,这样就可以顺利构建出用于新闻分类的朴素贝叶斯分类器了

if __name__ == '__main__':
    folder_path = './SogouC/Sample'
    (all_words_list,
     train_data_list, test_data_list,
     train_class_list, test_class_list) = TextProcessing(folder_path, test_size=0.2)

    stopwords_set = MakeWordsSet('./stopwords_cn.txt')

    # Single run with the 450 most frequent words dropped.
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy_list = [
        TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    ]
    # Mean of the collected accuracies; the list holds exactly one run here.
    print(sum(test_accuracy_list) / len(test_accuracy_list))



阅读更多
文章标签: python
个人分类: 机器学习
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭
关闭