机器学习实战:朴素贝叶斯

from numpy import *

def load():
    """Return the built-in toy corpus and its class labels.

    Returns:
        A tuple ``(postingList, classVec)``: a list of six tokenised
        documents (lists of words) and a parallel list of 0/1 labels.
    """
    # Each entry pairs a label with the tokens of one document.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    documents = [tokens for _, tokens in labelled_posts]
    labels = [label for label, _ in labelled_posts]
    return documents, labels


def creatList(dataSet):
    """Build the vocabulary: every distinct token found in ``dataSet``.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable,
            mutually comparable tokens (e.g. strings).

    Returns:
        A sorted list of the unique tokens. Sorting makes the vocabulary
        order (and therefore every word vector built from it) reproducible
        across runs; iterating a set of strings is not, because string
        hashing is randomised per process.
    """
    vocabulary = set()
    for document in dataSet:
        vocabulary.update(document)
    return sorted(vocabulary)


def word2Vec(vocList, dataSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocList: Sequence of vocabulary tokens; position defines the
            vector index of each token.
        dataSet: Iterable of tokens making up one document.

    Returns:
        A list of ``len(vocList)`` ints: 1 where the vocabulary token occurs
        in the document, 0 otherwise. Tokens missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Map each token to its first position once, instead of scanning the
    # vocabulary twice (``in`` + ``index``) for every word of the document.
    position_of = {}
    for position, token in enumerate(vocList):
        position_of.setdefault(token, position)

    returnVec = [0] * len(vocList)
    for word in dataSet:
        position = position_of.get(word)
        if position is None:
            print('the word %s is not in vocab list! ' % word)
        else:
            returnVec[position] = 1
    return returnVec


def trainNB(data, class_):
    """Estimate naive Bayes log-likelihoods and the class-1 prior.

    Args:
        data: Sequence of equal-length numeric word vectors, one per document.
        class_: Sequence of labels aligned with ``data``. A label equal to 1
            puts the document in class 1; any other value counts as class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pa)``: the per-word log-probability arrays
        for class 0 and class 1, and ``pa``, the sum of the labels divided by
        the number of documents (the fraction labelled 1 for 0/1 labels).
    """
    numDoc = len(data)
    numWord = len(data[0])
    prior_class1 = sum(class_) / float(numDoc)

    # Smoothing: every per-word count starts at 1 and every per-class total
    # starts at 2.0, so no word ends up with probability zero.
    word_counts = {0: ones(numWord), 1: ones(numWord)}
    class_totals = {0: 2.0, 1: 2.0}

    for i, doc_vector in enumerate(data):
        bucket = 1 if class_[i] == 1 else 0
        word_counts[bucket] += doc_vector
        class_totals[bucket] += sum(doc_vector)

    # Work in log space so that later products of many small probabilities
    # become sums.
    log_likelihood1 = log(word_counts[1] / class_totals[1])
    log_likelihood0 = log(word_counts[0] / class_totals[0])
    return log_likelihood0, log_likelihood1, prior_class1


def classifyNB(vec, p0Vec, p1Vec, pa_):
    """Pick the more probable class for one word vector.

    Args:
        vec: Word vector of the document to classify (list or array).
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pa_: Prior probability of class 1 (as returned by ``trainNB``, i.e.
            the fraction of training documents labelled 1).

    Returns:
        0 if the class-0 log-posterior is strictly greater, otherwise 1.
    """
    features = asarray(vec)  # lets plain lists be used for every argument
    # ``pa_`` is P(class 1), so it belongs to the class-1 score and its
    # complement to the class-0 score (the original had them swapped).
    score0 = sum(features * p0Vec) + log(1.0 - pa_)
    score1 = sum(features * p1Vec) + log(pa_)
    return 0 if score0 > score1 else 1


def testNB(testList):
    """Train on the built-in corpus and print the predicted class of a sample.

    Prints the vocabulary, the trained parameters and the predicted class of
    ``testList`` to stdout; returns nothing.

    Args:
        testList: Iterable of tokens forming the document to classify.
    """
    documents, labels = load()
    vocabulary = creatList(documents)
    print(vocabulary)

    # One set-of-words vector per training document.
    train_vectors = [word2Vec(vocabulary, document) for document in documents]
    log_p0, log_p1, prior = trainNB(train_vectors, labels)
    print(log_p0, '\n', log_p1, '\n pa = \t', prior)

    sample_vector = word2Vec(vocabulary, testList)
    predicted = classifyNB(sample_vector, log_p0, log_p1, prior)
    print('该样本的分类为 :\t', predicted)


def bagWord2Vec(vocList, dataSet):
    """Convert a document into a bag-of-words (occurrence count) vector.

    Args:
        vocList: Sequence of vocabulary tokens; position defines the
            vector index of each token.
        dataSet: Iterable of tokens making up one document.

    Returns:
        A list of ``len(vocList)`` ints holding how many times each
        vocabulary token occurs in the document. Tokens missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Map each token to its first position once, instead of scanning the
    # vocabulary twice (``in`` + ``index``) for every word of the document.
    position_of = {}
    for position, token in enumerate(vocList):
        position_of.setdefault(token, position)

    returnVec = [0] * len(vocList)
    for word in dataSet:
        position = position_of.get(word)
        if position is None:
            print('the word %s is not in vocab list! ' % word)
        else:
            returnVec[position] += 1
    return returnVec


if __name__ == '__main__':
    # Demo run: train on the built-in corpus and classify a two-word sample.
    testNB(['stupid', 'dalmation'])

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值