# Naive Bayes classifier source code, annotated with comments and docstrings.
from numpy import *
def loadDataSet():
    ''' () -> list of lists, list
    Return a small hand-built corpus: a list of tokenized forum posts and
    a parallel list of class labels (0 = not abusive, 1 = abusive).
    >>> postingList, classVec = loadDataSet()
    >>> len(postingList)
    6
    >>> classVec
    [0, 1, 0, 1, 0, 1]
    '''
    # Keep each post next to its label so the pairing is obvious at a glance.
    labeledPosts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [post for post, _ in labeledPosts]
    classVec = [label for _, label in labeledPosts]
    return postingList, classVec
def createVocabList(dataSet):
    ''' (list of lists) -> list
    Return a sorted list of the unique words appearing in dataSet.
    Sorting makes the vocabulary order deterministic across runs; plain
    set iteration order depends on string hash seeding, so word vectors
    built from an unsorted vocabulary are not reproducible.
    >>> createVocabList([['b', 'a'], ['c', 'a']])
    ['a', 'b', 'c']
    '''
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)  # union in this post's words
    return sorted(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    ''' (list, list) -> list
    Return a 0/1 vector parallel to vocabList: position i is 1 if
    vocabList[i] occurs in inputSet at least once, else 0 (set-of-words
    model). Words of inputSet that are not in vocabList are ignored.
    >>> setOfWords2Vec(['apple', 'pen', 'good'], ['pen', 'is', 'a', 'good', 'writing', 'tool'])
    [0, 1, 1]
    '''
    # Precompute word -> index once; the original's `vocabList.index(word)`
    # rescanned the list for every word, O(len(inputSet) * len(vocabList)).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] = 1
    return returnVec
def bagOfWords2Vec(vocabList, inputSet):
    ''' (list, list) -> list
    Return a count vector parallel to vocabList: position i is the number
    of times vocabList[i] occurs in inputSet (bag-of-words model).
    Words of inputSet that are not in vocabList are ignored.
    >>> bagOfWords2Vec(['apple', 'pen', 'good'], ['pen', 'is', 'a', 'good', 'writing', 'tool', 'pen'])
    [0, 2, 1]
    '''
    # Precompute word -> index once; the original's `vocabList.index(word)`
    # rescanned the list for every word, O(len(inputSet) * len(vocabList)).
    wordIndex = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]] += 1
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    ''' (list of lists, list) -> 1D-array, 1D-array, float
    Train a naive Bayes model on word vectors.

    trainMatrix is a list of word vectors (one per post, each as long as
    the vocabulary); trainCategory is the list of labels, where 0 marks a
    non-abusive post and 1 an abusive post.

    Return (p0Vect, p1Vect, pAbusive):
      - p0Vect: log of P(word | non-abusive) for every vocabulary word
      - p1Vect: log of P(word | abusive) for every vocabulary word
      - pAbusive: the prior P(abusive), i.e. the fraction of label-1 posts

    Counts start at 1 and denominators at 2 (Laplace smoothing) so no
    word gets probability zero; log-space avoids underflow when many
    per-word probabilities are multiplied during classification.
    '''
    numTrainDocs = len(trainMatrix)   # number of posts
    numWords = len(trainMatrix[0])    # vocabulary size (length of each word vector)
    # Prior probability of the abusive class.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Smoothed per-word counts and per-class totals.
    p0Num, p1Num = ones(numWords), ones(numWords)
    p0Denom, p1Denom = 2, 2
    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec           # word occurrences in abusive posts
            p1Denom += sum(docVec)    # total words in abusive posts
        else:
            p0Num += docVec           # word occurrences in non-abusive posts
            p0Denom += sum(docVec)    # total words in non-abusive posts
    # Conditional probabilities in log space.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2classify, p0Vect, p1Vect, pClass1):
    ''' (array, array, array, float) -> str
    Classify a word vector with log-space naive Bayes.

    p0Vect and p1Vect hold the log conditional word probabilities for the
    non-abusive and abusive classes (as produced by trainNB0); pClass1 is
    the prior P(abusive). The comparison uses the log posterior up to the
    shared evidence term P(document), which cancels:
        log P(c | doc) ~ sum(vec2classify * logP(word | c)) + log P(c)
    Return 'Abusive post' or 'Non-abusive post'.
    '''
    logPosterior1 = sum(vec2classify * p1Vect) + log(pClass1)
    logPosterior0 = sum(vec2classify * p0Vect) + log(1 - pClass1)
    return 'Abusive post' if logPosterior1 > logPosterior0 else 'Non-abusive post'
def testingNB():
    ''' () -> NoneType
    Self-contained smoke test: train on the toy corpus from loadDataSet()
    and classify two sample posts, printing each post with its prediction.
    >>> testingNB()
    ['love', 'my', 'dalmation'] classified as: Non-abusive post
    ['stupid', 'garbage'] classified as: Abusive post
    '''
    listOfPosts, listOfClasses = loadDataSet()
    vocabList = createVocabList(listOfPosts)
    trainMatrix = [setOfWords2Vec(vocabList, post) for post in listOfPosts]
    p0V, p1V, pAb = trainNB0(array(trainMatrix), array(listOfClasses))
    # print(...) with one pre-formatted string prints identically under
    # Python 2 and Python 3 (the original used a Python-2-only print statement).
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(vocabList, testEntry))
    print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))
    # Fixed typo: was 'grabage', which is not in the vocabulary and so was
    # silently dropped from the word vector.
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(vocabList, testEntry))
    print('%s classified as: %s' % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))