1、相关公式:朴素贝叶斯最核心的部分是贝叶斯法则,而贝叶斯法则的基石是条件概率
- 贝叶斯法则:$P(c \mid w) = \dfrac{P(w \mid c)\,P(c)}{P(w)}$,其中 $c$ 表示类别,$w$ 表示文档的词向量
- 条件概率:$P(A \mid B) = \dfrac{P(A \cap B)}{P(B)}$,其中 $P(B) > 0$
2、实现代码
# coding:utf-8
from math import log
from numpy import ones,zeros
import numpy as np
def loadDataSet():
    """Return the built-in toy corpus and its class labels.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        tokenized sentences (each a list of word strings); ``classVec`` is
        the parallel list of labels, 0 for normal language and 1 for
        insulting language. Fresh lists are built on every call.
    """
    # Six tokenized sentences.
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # Label of each sentence above: 0 = normal language, 1 = insulting.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
def createVocabList(dataSet):
    """Build the vocabulary: every distinct word appearing in ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: the distinct tokens, in order of first appearance. A stable
        order keeps the columns of the word vectors reproducible between
        runs; converting a ``set`` straight to a list does not guarantee
        that for strings because of hash randomization.
    """
    seen = set()
    vocabList = []
    for document in dataSet:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocabList.append(word)
    return vocabList
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position defines the column.
        inputSet: iterable of words making up the document.

    Returns:
        list: ``len(vocabList)`` ints, 1 where the vocabulary word occurs in
        ``inputSet`` and 0 elsewhere. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Build the word -> column map once instead of scanning the list for
    # every token. setdefault keeps the FIRST position of a repeated
    # vocabulary word, matching list.index().
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print("the word :%s is not in my vocabulary" % word)
        else:
            returnVec[position] = 1
    return returnVec
def bagOfWords2Vec(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: list of vocabulary words; position defines the column.
        inputSet: iterable of words making up the document.

    Returns:
        list: ``len(vocabList)`` ints, each the number of times the
        corresponding vocabulary word occurs in ``inputSet``. Words missing
        from the vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> column map once instead of scanning the list for
    # every token. setdefault keeps the FIRST position of a repeated
    # vocabulary word, matching list.index().
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print("the word :%s is not in my vocabulary" % word)
        else:
            returnVec[position] += 1
    return returnVec
def trainNB(trainMatrix, trainCategory):
    """Estimate naive-Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one row
            per training document.
        trainCategory: sequence of labels parallel to ``trainMatrix``;
            a label equal to 1 marks an insulting document, anything else
            is treated as class 0.

    Returns:
        tuple: ``(pAbusive, p0Vec, p1Vec)``.
        ``pAbusive`` is the fraction of documents labelled 1 (a float).
        ``p0Vec`` / ``p1Vec`` are numpy arrays holding, per vocabulary
        column, the natural log of the smoothed probability of that word
        given class 0 / class 1.

    Raises:
        ValueError: if ``trainMatrix`` is empty or not two-dimensional, or
            if the number of labels differs from the number of rows.

    Notes:
        The vectors are log-probabilities because classifyNB adds
        ``log(prior)`` to the dot product of the document vector with these
        vectors; raw probabilities would mix the two scales and underflow
        when many are multiplied. Counts start at 1 and totals at 2.0 so a
        word never seen in a class (or a class with no documents) does not
        produce a zero probability or a division by zero.
    """
    docs = np.asarray(trainMatrix, dtype=float)
    if docs.ndim != 2 or docs.shape[0] == 0:
        raise ValueError("trainMatrix must be a non-empty 2-D collection of word vectors")
    numTrainDocs = docs.shape[0]

    labels = np.asarray(trainCategory)
    if labels.shape[0] != numTrainDocs:
        raise ValueError("trainCategory must contain one label per row of trainMatrix")

    isAbusive = labels == 1
    # Prior of the insulting class: share of documents labelled 1.
    pAbusive = float(isAbusive.sum()) / numTrainDocs

    # Per-word occurrence counts in each class, smoothed with a start of 1.
    p1Num = 1.0 + docs[isAbusive].sum(axis=0)
    p0Num = 1.0 + docs[~isAbusive].sum(axis=0)
    # Total word occurrences in each class, smoothed with a start of 2.
    p1Denom = 2.0 + docs[isAbusive].sum()
    p0Denom = 2.0 + docs[~isAbusive].sum()

    # Log transform guards against floating-point underflow downstream.
    p1Vec = np.log(p1Num / p1Denom)
    p0Vec = np.log(p0Num / p0Denom)
    return pAbusive, p0Vec, p1Vec
def classifyNB(vec2classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector with a two-class naive-Bayes model.

    Args:
        vec2classify: numeric word vector of the document to classify.
        p0Vec: per-word LOG-probabilities given class 0, same length as
            ``vec2classify``.
        p1Vec: per-word LOG-probabilities given class 1.
        pClass1: prior probability of class 1, in [0, 1].

    Returns:
        int: 1 if the class-1 log score is strictly greater than the
        class-0 log score, otherwise 0 (ties go to class 0).

    Raises:
        ValueError: if ``pClass1`` is not within [0, 1].
    """
    if not 0.0 <= pClass1 <= 1.0:
        raise ValueError("pClass1 must be a probability in [0, 1], got %r" % (pClass1,))

    # A prior of exactly 0 has log-probability -inf; math.log would raise
    # on it, so map it explicitly and let the comparison decide.
    negInf = float("-inf")
    logPrior1 = log(pClass1) if pClass1 > 0.0 else negInf
    logPrior0 = log(1.0 - pClass1) if pClass1 < 1.0 else negInf

    # Convert up front so plain Python lists are accepted as well as arrays.
    features = np.asarray(vec2classify, dtype=float)
    p1 = float((features * np.asarray(p1Vec, dtype=float)).sum()) + logPrior1
    p0 = float((features * np.asarray(p0Vec, dtype=float)).sum()) + logPrior0
    return 1 if p1 > p0 else 0
if __name__ == '__main__':
    # Demo run: load the toy corpus, build its vocabulary, encode every
    # post as a set-of-words vector, train, then print the vocabulary, the
    # class-1 prior and the two per-class word vectors returned by trainNB.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    pAb, p0V, p1V = trainNB(trainMat, listClasses)
    print(myVocabList)
    print(pAb)
    print(p0V)
    print(p1V)