《机器学习实战》之朴素贝叶斯

基于概率论的分类方法:朴素贝叶斯

优点:在数据较少的情况下仍然有效,可以处理多类别问题。

缺点:对于输入数据的准备方式较为敏感。

适用数据类型:标称型数据。


使用条件概率进行分类。


# -*- coding: utf-8 -*-
from numpy import *

# --Step1--
# 1> loadDataSet(): 加载实验样本
def loadDataSet():
    """Return the toy training corpus together with its class labels.

    Returns:
        A pair ``(postingList, classVec)``. ``postingList`` is a list of six
        tokenized posts (each a list of word strings); ``classVec`` is the
        parallel list of labels, where 1 marks an abusive post and 0 marks
        normal speech.
    """
    # Each entry pairs one post (whitespace-separated tokens) with its label.
    labelledPosts = (
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    )
    postingList = [text.split() for text, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec
    
# 2> createVocabList(dataSet): 创建词汇表(每个单词唯一),即所有文档中出现过的不重复单词的列表
def createVocabList(dataSet):
    """Build the vocabulary: every distinct token that appears in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens
            that can be ordered against each other (e.g. word strings).

    Returns:
        A sorted list containing each distinct token exactly once. An empty
        ``dataSet`` yields an empty list.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union; avoids rebuilding the whole set for every document.
        vocabSet.update(document)
    # Set iteration order is unspecified and can change from run to run, so
    # sort to give the feature-vector columns a reproducible layout.
    return sorted(vocabSet)

# 3> setOfWords2Vec(vocabList, inputSet): 输出文档向量,向量的每个元素为1或0, 1出现, 0未出现
# 词集模型(set-of-words model), 每个词只能出现一次!
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in ``inputSet`` (however many times), 0 otherwise. Words missing from
        the vocabulary are reported on stdout and otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # One scan of the vocabulary instead of a membership test plus index().
        try:
            position = vocabList.index(word)
        except ValueError:
            # Single-argument call form: valid and identical in output on
            # both Python 2 and Python 3 (the bare print statement is not).
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
 
 # 4> bagOfWords2Vec(vocabList, inputSet): 输出文档向量
 # 词袋模型(bag-of-words model),每个词可以出现不止一次
def bagOfWords2Vec(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Unlike ``setOfWords2Vec``, repeated words are counted rather than
    collapsed to a single 1.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints holding how many times each
        vocabulary word occurs in ``inputSet``. Words missing from the
        vocabulary are reported on stdout and otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # One scan of the vocabulary instead of a membership test plus index().
        try:
            position = vocabList.index(word)
        except ValueError:
            # Single-argument call form: valid and identical in output on
            # both Python 2 and Python 3 (the bare print statement is not).
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] += 1
    return returnVec
    
          
# --Step2--
# trainMatrix : 文档矩阵,由文档向量构成!!!
# trainCategory : 由每篇文档类别标签所构成的向量(0: 正常言论  1: 侮辱言论)
#
def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from labelled document vectors.

    Args:
        trainMatrix: sequence of document vectors, one row per document and
            one column per vocabulary word.
        trainCategory: sequence of class labels parallel to ``trainMatrix``;
            a label equal to 1 selects class 1, anything else class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: the element-wise natural log
        of the smoothed per-word frequencies for class 0 and class 1, and
        ``sum(trainCategory)`` divided by the number of documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Index 0 holds the class-0 accumulators, index 1 the class-1 ones.
    # Word counts start at 1 and totals at 2.0 so that a word never seen in
    # a class does not get probability zero.
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    wordTotals = [2.0, 2.0]
    for docIndex, docVector in enumerate(trainMatrix):
        cls = 1 if trainCategory[docIndex] == 1 else 0
        wordCounts[cls] += docVector          # element-wise accumulation
        wordTotals[cls] += sum(docVector)     # total word mass for the class

    # Work in log space so that multiplying many small probabilities later
    # becomes a sum and cannot underflow.
    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive

# --Step3--

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for a document vector.

    Args:
        vec2Classify: document vector to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # Element-wise product keeps only the log-probabilities of the words
    # present in the document; summing them plus the log prior gives the
    # (unnormalized) log posterior of each class.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

def testingNB():
    """Train on the toy corpus and print the predicted class of two posts.

    Builds the vocabulary and set-of-words training matrix from
    ``loadDataSet()``, fits the model with ``trainNB0`` and prints one line
    per sample post showing the class chosen by ``classifyNB``.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        # A single pre-formatted argument runs on both Python 2 and 3 (the
        # bare print statement is a SyntaxError on 3). The two spaces after
        # the colon reproduce the separator the comma-form print inserted.
        print("%s classified as:  %s" % (testEntry, predicted))
  


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Digital2Slave

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值