《机器学习实战》个人学习记录笔记(八)———朴素贝叶斯实战篇之垃圾邮件分类

第四章 朴素贝叶斯

PS:个人笔记 根据《机器学习实战》这本书,Jack-Cui的博客,以及深度眸的视频进行学习

1 两个改进

拉普拉斯平滑(Laplace Smoothing)又被称为加1平滑,是比较常用的平滑方法,它就是为了解决0概率问题

下溢出:这是由于太多很小的数相乘造成的。为了解决这个问题,对乘积结果取自然对数。通过求对数可以避免下溢出或者浮点数舍入导致的错误。同时,采用自然对数进行处理不会有任何损失。

def trainNB0(trainMatrix,trainCategory):
    """Train a naive Bayes classifier from document word vectors.

    Args:
        trainMatrix: one word-count/word-presence vector per document.
        trainCategory: per-document class labels (1 = abusive/spam).

    Returns:
        (p0Vect, p1Vect, pAbusive): log P(word | class) vectors for
        class 0 and class 1, and the prior P(class = 1).
    """
    docTotal = len(trainMatrix)
    wordTotal = len(trainMatrix[0])
    # Prior probability of class 1.
    pAbusive = sum(trainCategory) / float(docTotal)
    # Laplace (add-one) smoothing: counts start at 1 and denominators
    # at 2 so no word ever receives zero probability.
    p0Num, p1Num = np.ones(wordTotal), np.ones(wordTotal)
    p0Denom, p1Denom = 2.0, 2.0
    for row, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += row
            p1Denom += sum(row)
        else:
            p0Num += row
            p0Denom += sum(row)
    # Log-space probabilities guard against floating-point underflow
    # when many small factors are multiplied during classification.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

2 过滤垃圾邮件

import re

"""
Parameters:
    无
Returns:
    无
"""
def textParse(bigString):                                                   # parse raw text into a token list
    """Split raw text into a list of lowercase word tokens.

    Splits on runs of non-word characters (r'\W+'), which — unlike the
    lazy r'\W.*?' workaround — never produces empty tokens between
    consecutive delimiters. Tokens of length <= 2 are dropped to filter
    out noise words and URL fragments.
    """
    listOfTokens = re.split(r'\W+', bigString)                              # one split per run of delimiters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]            # keep words longer than 2 chars, lowercased

"""
Parameters:
    dataSet - 整理的样本数据集
Returns:
    vocabSet - 返回不重复的词条列表,也就是词汇表
"""
def createVocabList(dataSet):
    """Build the vocabulary: a list of every distinct word in dataSet."""
    vocab = set()                           # set membership removes duplicates
    for document in dataSet:
        vocab.update(document)              # fold each document's words in
    return list(vocab)

if __name__ == '__main__':
    docList = []; classList = []
    for i in range(1, 26):                                                  # 25 emails per class, numbered 1-25
        wordList = textParse(open('spam/%d.txt' % i, 'r',encoding='gb18030',errors='ignore').read())  # read a spam email; gb18030 + errors='ignore' tolerates the corpus's mixed encodings
        docList.append(wordList)
        classList.append(1)                                                 # label 1 = spam
        wordList = textParse(open('ham/%d.txt' % i, 'r',encoding='gb18030',errors='ignore').read())     # read a ham (non-spam) email as a token list
        docList.append(wordList)
        classList.append(0)                                                 # label 0 = ham
    vocabList = createVocabList(docList)                                    # deduplicated vocabulary over all documents
    print(vocabList)

文本向量化,我们将数据集分为训练集和测试集,使用交叉验证的方式测试朴素贝叶斯分类器的准确性。

import numpy as np
import random
import re

def createVocabList(dataSet):
    """Return a list of the unique words appearing across all documents."""
    # Union of all per-document word sets; set() handles the empty case.
    return list(set().union(*dataSet))

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a binary set-of-words vector over vocabList."""
    vec = [0] * len(vocabList)                                    # one slot per vocabulary word
    for token in inputSet:
        try:
            vec[vocabList.index(token)] = 1                       # mark word as present
        except ValueError:                                        # token missing from the vocabulary
            print("the word: %s is not in my Vocabulary!" % token)
    return vec

def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words count vector over vocabList."""
    # Map each word to its first index, mirroring list.index semantics.
    position = {}
    for idx, word in enumerate(vocabList):
        position.setdefault(word, idx)
    vec = [0] * len(vocabList)
    for token in inputSet:
        if token in position:                 # unknown words are silently ignored
            vec[position[token]] += 1
    return vec

def trainNB0(trainMatrix,trainCategory):
    """Estimate naive-Bayes parameters from training vectors.

    Args:
        trainMatrix: matrix of per-document word vectors.
        trainCategory: per-document labels (1 = spam/abusive).

    Returns:
        p0Vect, p1Vect: log P(word | class) for class 0 and class 1.
        pAbusive: prior probability P(class = 1).
    """
    nDocs = len(trainMatrix)
    nWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(nDocs)
    # Laplace smoothing: per-class counts start at 1, totals at 2,
    # so unseen words never get probability zero.
    numer = [np.ones(nWords), np.ones(nWords)]      # word counts per class
    denom = [2.0, 2.0]                              # total counts per class
    for idx in range(nDocs):
        cls = 1 if trainCategory[idx] == 1 else 0
        numer[cls] += trainMatrix[idx]
        denom[cls] += sum(trainMatrix[idx])
    # Log space avoids floating-point underflow in later products.
    p1Vect = np.log(numer[1] / denom[1])
    p0Vect = np.log(numer[0] / denom[0])
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: 1 if the class-1 posterior wins, else 0.

    Works in log space: log(prod P(w_i|c) * P(c)) = sum log P(w_i|c) + log P(c),
    so elementwise multiply-and-sum replaces a product of tiny factors.
    """
    logPost1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    logPost0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if logPost1 > logPost0 else 0

def textParse(bigString):                                              # parse raw text into a token list
    """Split raw text into a list of lowercase word tokens.

    Splits on runs of non-word characters (r'\W+') — the idiomatic
    pattern — instead of the lazy r'\W.*?' workaround, which matches a
    single non-word character and yields empty tokens between
    consecutive delimiters. Tokens of length <= 2 are dropped.
    """
    listOfTokens = re.split(r'\W+', bigString)                         # one split per delimiter run
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]       # lowercase, keep length > 2


def spamTest():                                                             #样本测试
    """Train and evaluate the naive Bayes spam filter.

    Reads 25 spam and 25 ham emails from disk, holds out 10 randomly
    chosen documents as a test set, trains on the remaining 40, and
    prints the misclassified documents and the error rate.
    """
    docList = []; classList = []; fullText = []
    for i in range(1, 26):                                              # 25 emails per class, numbered 1-25
        wordList = textParse(open('spam/%d.txt' % i, 'r',encoding='gb18030',errors='ignore').read())     # read a spam email; gb18030 + errors='ignore' tolerates the corpus's mixed encodings
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)                                             # label 1 = spam
        wordList = textParse(open('ham/%d.txt' % i, 'r',encoding='gb18030',errors='ignore').read())      # read a ham (non-spam) email
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)                                                 # label 0 = ham
    vocabList = createVocabList(docList)                                    # vocabulary over ALL 50 docs; NOTE(review): includes test docs — mild information leak, matches the book's code
    trainingSet = list(range(50)); testSet = []                             # index lists for the training and test documents
    for i in range(10):                                                     # randomly move 10 of the 50 indices into the test set
        randIndex = int(random.uniform(0, len(trainingSet)))                # uniform random index into the remaining pool
        testSet.append(trainingSet[randIndex])                              # hold this document out for testing
        del(trainingSet[randIndex])                                         # remove it so it cannot be picked again
    trainMat = []; trainClasses = []                                        # training matrix and its label vector
    for docIndex in trainingSet:                                            # build the training data
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))       # set-of-words vector for each training doc
        trainClasses.append(classList[docIndex])                            # matching class label
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))  # train; np arrays enable vectorized counting
    errorCount = 0                                                          # misclassification counter
    for docIndex in testSet:                                                # evaluate on the held-out documents
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])           # vectorize the test doc
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:    # prediction disagrees with the true label
            errorCount += 1                                                 # count the error
            print("分类错误的测试集:",docList[docIndex])
    print('错误率:%.2f%%' % (float(errorCount) / len(testSet) * 100))


if __name__ == '__main__':
    # Run the spam-filter evaluation when executed as a script.
    spamTest()






阅读更多
想对作者说点什么?

博主推荐

换一批

没有更多推荐了,返回首页