Classification Based on Probability Theory: Naive Bayes
I have recently been studying naive Bayes, working mainly from two books: Statistical Learning Methods and Machine Learning in Action.
Before starting on naive Bayes, readers should first review the relevant material from probability theory.
Learning and classification with naive Bayes
1. The basic method
2. What maximizing the posterior probability means
Parameter estimation for naive Bayes
1. Maximum likelihood estimation
2. The learning and classification algorithm
3. Bayesian estimation
For the details, please read Statistical Learning Methods.
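To anchor the outline above, here is a quick recap of the two formulas the code below implements (my summary of the book's notation, not a substitute for the text). Naive Bayes predicts the class that maximizes the posterior, which under the conditional-independence assumption reduces to

    y = \arg\max_{c_k} P(Y=c_k) \prod_{j} P(X^{(j)}=x^{(j)} \mid Y=c_k)

and the Bayesian (smoothed) estimate of each conditional probability is

    P_\lambda(X^{(j)}=a_{jl} \mid Y=c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\, y_i=c_k) + \lambda}{\sum_{i=1}^{N} I(y_i=c_k) + S_j \lambda}

where S_j is the number of values feature j can take; \lambda = 1 gives the Laplace smoothing that the training code below applies by initializing counts to ones() and denominators to 2.0.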
Machine Learning in Action walks through a concrete naive Bayes classification case study with full implementation code; please read it and work through the code.
The source code is as follows:
#-*- coding:utf-8 -*-
from numpy import *
import re
import chardet
import multiprocessing
from multiprocessing import Pool # process pool for parallel feature extraction
#from math import *
################ Word-list-to-vector conversion functions #########################
"""
def loadDataSet():
    postingList=[['my','dog','has','flea',\
                  'problems','help','please'],
                 ['maybe','not','take','him',\
                  'to','dog','park','stupid'],
                 ['my','dalmation','is','so','cute',\
                  'I','love','him'],
                 ['stop','posting','stupid','worthless','garbage'],
                 ['mr','licks','ate','my','steak','how',\
                  'to','stop','him'],
                 ['quit','buying','worthless','dog','food','stupid']]
    classVec=[0,1,0,1,0,1] # 1: abusive, 0: normal
    return postingList,classVec # postingList: the tokenized documents, classVec: the class labels
"""
def loadData(fileName):
    try:
        #trainList=open(fileName).read()
        fr = open(fileName).readlines()
    except:
        print "failed to open file"
        return 0
    pos=[];classVec=[]
    for line in fr:
        pos.append(line.decode('gbk','ignore'))
        classVec.append(0) # 0 = normal
    print pos[0],pos[1] #.encode('utf-8')
    print classVec
    return pos,classVec
# GBK-encoded Chinese punctuation plus ASCII punctuation to strip from the tokens:
# '\xa1\xa3'=。 '\xa3\xac'=, '\xa3\xbf'=? '\xa3\xa1'=! '\xa3\xbb'=; '\xa3\xba'=:
# '\xa1\xb0'/'\xa1\xb1'=double quotes, '\xa1\xae'/'\xa1\xaf'=single quotes,
# '\xa3\xa8'/'\xa3\xa9'=parentheses, '\xa1\xa2'=、
PUNCT = set([',', '.', '!', '?', '(', ')', '\"', '\'',
             '\xa1\xa3', '\xa3\xac', '\xa3\xbf', '\xa3\xa1', '\xa3\xbb', '\xa3\xba',
             '\xa1\xb0', '\xa1\xb1', '\xa1\xae', '\xa1\xaf',
             '\xa3\xa8', '\xa3\xa9', '\xa1\xa2'])
def testTextParse(filename,classify):
    text = open(filename).read()
    pattern = '<text>(.*?)</text>'
    str_list = re.findall(pattern, text, re.S) # re.S (DOTALL): '.' also matches newlines
    doc_list = []
    # \s matches any whitespace (space, newline, tab); compiling a frequently used pattern
    # into a regex object once avoids re-parsing it on every call.
    ptn = re.compile('\\s+')
    for doc in str_list:
        doc = ptn.split(doc)
        doc_list.append([term for term in doc if len(term)>=1 and term not in PUNCT])
    # for i in range(len(doc_list[0])):
    #     print doc_list[0][i].decode('gbk') #.encode('utf-8')
    if classify==0:
        classVec=zeros(len(doc_list))
    else:
        classVec=ones(len(doc_list))
    print 'class',classify,': len of doc_list',len(doc_list),', len of classVec',len(classVec)
    print classVec
    return doc_list,classVec
def testText(filename):
    text = open(filename).read()
    pattern = '<text>(.*?)</text>'
    str_list = re.findall(pattern, text, re.S) # re.S (DOTALL): '.' also matches newlines
    doc_list = []
    ptn = re.compile('\\s+') # split each <text> block on runs of whitespace
    for doc in str_list:
        doc = ptn.split(doc)
        doc_list.append([term for term in doc if len(term)>=1 and term not in PUNCT])
    for i in range(len(doc_list[0])):
        #if chardet.detect(doc_list[0][i])['encoding'] != "utf-8": # detect the token's encoding
        #    print doc_list[0][i].decode('gbk'), #.encode('utf-8')
        #else:
        print doc_list[0][i].decode('utf-8'),
    return doc_list
# Build a list of the unique words appearing across all documents: feeding each word list
# to the set constructor deduplicates it.
def createVocabList(dataSet):
    vocabSet=set([]) # start from an empty set
    for document in dataSet:
        vocabSet=vocabSet | set(document) # union of the two sets
    return list(vocabSet)
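# A minimal illustration (hypothetical input):
#   createVocabList([['my','dog'],['dog','park']]) -> e.g. ['my','park','dog']
# Set union deduplicates 'dog'; the ordering is arbitrary because Python sets are unordered.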
######## Set-of-words model ##########
# Treat each word's presence or absence as a feature: the set-of-words model.
def setOfWords2Vec(vocabList,inputSet):
returnVec=[0]*len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] = 1
else: print "the word: %s is not in my Vocabulary!"% word
return returnVec
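# Continuing the hypothetical illustration: with vocabList = ['my','park','dog'],
#   setOfWords2Vec(vocabList, ['dog','my','dog']) -> [1,0,1]
# Presence only: the repeated 'dog' still maps to a single 1.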
######### Bag-of-words model ############
# A word occurring more than once may carry information that mere presence/absence cannot
# express: the bag-of-words model.
def bagOfWords2VecMN(vocabList,inputSet):
returnVec = [0]*len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] += 1
return returnVec
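# The same hypothetical input through the bag-of-words version:
#   bagOfWords2VecMN(['my','park','dog'], ['dog','my','dog']) -> [1,0,2]
# The repeated 'dog' now counts twice; that is the only difference from setOfWords2Vec.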
################ Naive Bayes classifier training function #################
"""
# input: the document matrix trainMatrix, and trainCategory, the vector of class labels for each document
# First compute the probability that a document belongs to class 1 (abusive), i.e. P(1); P(0) = 1 - P(1).
# To compute p(wi|c1) and p(wi|c0), initialize the numerator and denominator variables. Since w has
# many elements, NumPy arrays make these computations fast.
# The numerator variable is a NumPy array with one entry per vocabulary word.
# The for loop walks every document in trainMatrix; whenever a word occurs, its count (p1Num or p0Num)
# is incremented, and the total word count for that document's class is incremented as well.
# Finally, divide each entry by the class's total word count; with NumPy this is just dividing the
# array by a float. The function returns two vectors and one probability.
"""
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix) # number of records, e.g. 2000; trainMatrix is 2000x8330: each record is an 8330-long 0/1 vector (1 = word present, 0 = absent)
    numWords = len(trainMatrix[0]) # length of the first record, e.g. 8330 (every record has the same length)
    print 'len(trainMatrix[0]):',numWords
    pAbusive = sum(trainCategory)/float(numTrainDocs) # P(c1) = (# records with class 1)/(total records); e.g. sum([1,2,3])=6, so sum(trainCategory) counts the class-1 records
    #p0Num = zeros(numWords);p1Num=zeros(numWords)
    #p0Denom = 0.0;p1Denom =0.0
    # When computing the product p(w0|1)*p(w1|1)*p(w2|1)..., a single zero probability zeroes out
    # the whole product. Initializing every word count to 1 and each denominator to 2 is Laplace
    # (add-one) smoothing: it keeps every conditional probability strictly positive.
    p0Num = ones(numWords);p1Num=ones(numWords)
    p0Denom = 2.0;p1Denom =2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1: # record i has class 1, so everything below conditions on class = 1
            p1Num += trainMatrix[i] # conditional independence assumption: per-word occurrence counts summed over class-1 records, e.g. p1Num=[1,2,0,34,0,...,0,2]
            p1Denom += sum(trainMatrix[i]) # total word count across class-1 records (the same as summing all entries of p1Num)
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # p1Vect = p1Num/p1Denom would give the p(wi|c1) vector: given class 1, the probability of
    # word wi is its class-1 count divided by the total class-1 word count (p0Vect likewise for c0).
    print "len(p1Num):",len(p1Num)
    print "len(p0Num):",len(p0Num)
    p1Vect = log(p1Num/p1Denom) # change to log(): many tiny probabilities multiplied together underflow to 0, so work in log space
    p0Vect = log(p0Num/p0Denom) # change to log()
    return p0Vect,p1Vect,pAbusive
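# A minimal sketch of what trainNB0 produces (hypothetical 3-word vocabulary):
#   trainMat = [array([1,1,0]), array([0,1,1])], trainCategory = [0,1]
#   -> pAbusive = 0.5, and with the add-one smoothing above,
#      p1Vect = log(array([1,2,2])/4.0), p0Vect = log(array([2,2,1])/4.0)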
########## Naive Bayes classifier #########
"""
# vec2Classify: the vector to classify
# p0Vec: p(wi|c0)
# p1Vec: p(wi|c1)
# pClass1: p(c1)
# p(ci|w) is proportional to p(w|ci)*p(ci), since the denominator p(w) is the same for both classes
"""
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1) # element-wise product, sum the per-word log probabilities, then add the class's log prior: logA+logB+logC = log(A*B*C)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1 # negative review
    else:
        return 0 # positive review
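# Continuing the sketch after trainNB0: classifying vec2Classify = array([0,1,1]) gives
#   p1 = log(2/4.) + log(2/4.) + log(0.5) = log(0.125)
#   p0 = log(2/4.) + log(1/4.) + log(0.5) = log(0.0625)
# so p1 > p0 and classifyNB returns 1.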
def testingNB():
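    # Note: this smoke test was written against the commented-out English loadDataSet() above;
    # with the Chinese loadDataSet() below, these English test words fall outside the vocabulary
    # and the prediction is driven by the class priors alone.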
listPosts,listClasses=loadDataSet()
myVocabList = createVocabList(listPosts)
trainMat=[]
for postinDoc in listPosts:
trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
p0V,p1V,pAb=trainNB0(trainMat,listClasses)
testEntry = ['love','my','dalmation']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print testEntry,'classified as :',classifyNB(thisDoc,p0V,p1V,pAb)
testEntry=['stupid','garbage']
thisDoc =array(setOfWords2Vec(myVocabList,testEntry))
print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)
### Load the data set ###
# returns listPosts: the list of tokenized documents
#         listClasses: the list of class labels
def loadDataSet():
listPosts,listClasses=testTextParse("dataset/positive.txt",0) #0:positive
a=chardet.detect(listPosts[0][0])
print a
print 'positive len(listPosts):',len(listPosts) #2000
print "positive listClasses:",listClasses
listPosts1,listClasses1=testTextParse("dataset/negative.txt",1) #1:negative
print ' negative len(listPosts1):',len(listPosts1)
print "negative listClasses1:",listClasses1
listPosts += listPosts1
listClasses =list(listClasses)+list(listClasses1)
listClasses = array(listClasses)
print "positive+negative:len(listPosts):",len(listPosts) #4000
print "positive+negative:len(listClasses):",len(listClasses)
print "listClasses:",listClasses
    # listPosts is the resulting list, one tokenized review per entry, e.g.
    #   listPosts[0] = 距离 川沙 公路 较 近 但是 公交 指示 不 对 如果 是 蔡陆线 的话 会 非常 麻烦 建议 用 别 的 路线 房间 较为 简单
    #   listPosts[1] = ...
return listPosts,listClasses
def testingNBChinese(fileName):
    listPosts,listClasses = loadDataSet()
    myVocabList=createVocabList(listPosts) # build the vocabulary
    print 'len(myVocabList):',len(myVocabList) #0:8330 1:15844-8330
    #### The next 3 lines are optional: sample a few entries to confirm the vocabulary was built ####
    for i in range(len(myVocabList)/1000):
        #if chardet.detect(myVocabList[i])
        a=myVocabList[i].decode('utf-8','ignore')
        print a
    #### The next 4 lines are optional: show where the first document's words sit in the vocabulary ####
    returnVec=setOfWords2Vec(myVocabList,listPosts[0]) # convert the document to a presence vector over the vocabulary
    for i in range(len(returnVec)):
        if returnVec[i]==1:
            print i,myVocabList[i].decode('utf-8','ignore')
    #trainMat=[]
    #for postinDoc in listPosts: # for each of the 4000 records
    #    trainMat.append(setOfWords2Vec(myVocabList,postinDoc)) # convert each record to a vector
    #save("dataset/4000Vec.npy",trainMat)
    trainMat=load("dataset/4000Vec.npy") # cached vectors for the 4000 records
    print trainMat[0],len(trainMat[0]) # 15844 -> after switching the encoding to utf-8, the vocabulary length became 15837
for i in range(len(trainMat[0])):
if trainMat[0][i]==1:
print i,
p0V,p1V,pClass1=trainNB0(trainMat,listClasses)
#testEntry=testText("dataset/testDatasetPositive.txt")
testEntry=testText(fileName)
for i in range(len(testEntry)):
testDoc=setOfWords2Vec(myVocabList,testEntry[i])
print "len of testDoc:",len(testDoc)
for i in range(len(testDoc)):
if testDoc[i]==1:
print i,myVocabList[i].decode('utf-8','ignore'),
print 'testEntry classified as: ',classifyNB(testDoc,p0V,p1V,pClass1)
testEntry=testText("dataset/testDatasetNegative.txt")
testDoc=setOfWords2Vec(myVocabList,testEntry[0])
print "len of testDoc:",len(testDoc)
for i in range(len(testDoc)):
if testDoc[i]==1:
print i,myVocabList[i].decode('utf-8','ignore'),
print 'testEntry classified as: ',classifyNB(testDoc,p0V,p1V,pClass1)
print "OK"
######## Cross validation ###########
def crossVarify():
    #for k in range(10): # average over 10 cross-validation runs
    #print "run",k,":"
    testSet=[]
    listPosts,listClasses = loadDataSet()
    #trainSet=listPosts
    print len(listPosts)
    trainSet=range(len(listPosts)) # index list 0..len(listPosts)-1, so del works later (del cannot remove elements of a numpy.array)
    #print trainSet
    myVocabList = createVocabList(listPosts) # build the vocabulary
    print "len(myVocabList):",len(myVocabList)
    from functools import partial
    # partial_setOfWords2Vec = partial(setOfWords2Vec,vocabList=myVocabList) # partial() pre-binds an argument of the function
    # hold out 400 records as the test set
    for i in range(400):
        randIndex = int(random.uniform(0,len(trainSet))) # random index in [0, len(trainSet))
        testSet.append(trainSet[randIndex])
        del(trainSet[randIndex]) # remove the held-out record from the training indices
print "testSet :",testSet
trainMat=[];trainClasses=[]
    ### single-process alternative: build the training matrix with a plain for loop ###
    # for docIndex in trainSet:
    #     trainMat.append(setOfWords2Vec(myVocabList,listPosts[docIndex]))
    #     trainClasses.append(listClasses[docIndex])
print "multiprocesses start:"
pool = Pool(processes=8) # start 8 worker processes
# map.async:非阻塞,
# partial(setOfWords2Vec,myVocabList):把myVocabList赋给函数的第一个参数,
# 如果写成partial(setOfWords2Vec,vocabList = myVocabList) 会报错 got multi argument
# [listPosts[docIndex] for docIndex in trainSet] :获取listPosts中指定的列
# 相当于 for docIndex in trainSet: listPosts[docIndex]
#.get(120):获得pool.map_async()的结果,120秒以后获取不到就退出
trainMat.append(pool.map_async(partial(setOfWords2Vec,myVocabList),[listPosts[docIndex] for docIndex in trainSet] ).get(120))
pool.close()
pool.join()
print "trainMat.append OK, multiprocesses end."
print "len(trainMat):",len(trainMat)
for docIndex in trainSet:
trainClasses.append(listClasses[docIndex])
# for i in range(len(trainMat[0])) :
# if trainMat[0][i] != 0:
# a = trainMat[0][i]
# print i,a
# print "trainMat[0]=",trainMat[0]
print "trainClasses:",trainClasses
    #p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) # single-process version: use trainMat
    p0V,p1V,pSpam = trainNB0(array(trainMat[0]),array(trainClasses)) # multiprocess version: map_async returned one list of vectors, so use trainMat[0]
errorCount = 0
print "errorCount=0"
for docIndex in testSet:
wordVector = setOfWords2Vec(myVocabList,listPosts[docIndex])
testClassify = classifyNB(array(wordVector),p0V,p1V,pSpam)
print "第%d个测试分类:%d,实际分类:%d" %(docIndex,testClassify,listClasses[docIndex])
if testClassify != listClasses[docIndex]:
errorCount += 1
print errorCount
print 'the error rate is :',float(errorCount)/len(testSet)
######## File parsing and the full spam-filter test ##############
### textParse() accepts one big string and parses it into a list of token strings ###
def textParse(bigString):
import re
listOfTokens = re.split(r'\W',bigString)
return [tok.lower() for tok in listOfTokens if len(tok)>2]
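# Example (hypothetical input): textParse('Hi there, this is Mr. Smith.') returns
# ['there', 'this', 'smith'] -- re.split(r'\W') cuts at every non-word character, and the
# comprehension lowercases the tokens and drops those of length <= 2 (including empty strings).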
def spamTest():
docList=[];classList=[];fullText=[]
    # for loop: load and parse the text files
for i in range(1,26):
wordList = textParse(open('email/spam/%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = textParse(open('email/ham/%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocabList = createVocabList(docList)
trainingSet =range(50);testSet=[]
    # randomly pick 10 of the 50 emails as the test set: append each chosen index to testSet and delete it from trainingSet
    # randomly holding out part of the data for testing and training on the rest is called hold-out cross validation
for i in range(10):
randIndex = int(random.uniform(0,len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
trainMat=[];trainClasses=[]
for docIndex in trainingSet:
trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
errorCount = 0
for docIndex in testSet:
wordVector = setOfWords2Vec(vocabList,docList[docIndex])
if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
errorCount+=1
print 'the error rate is :',float(errorCount)/len(testSet)
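# Note: spamTest() assumes the book's corpus layout relative to this script: 25 spam emails in
# email/spam/1.txt..25.txt and 25 ham emails in email/ham/1.txt..25.txt.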
######## RSS feed classifier and frequent-word removal ##############
def calMostFreq(vocabList,fullText):
import operator
freqDict = {}
for token in vocabList:
freqDict[token] = fullText.count(token)
sortedFreq = sorted(freqDict.iteritems(), key = operator.itemgetter(1),reverse=True)
return sortedFreq[:30]
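# calMostFreq() returns the 30 most frequent (token, count) pairs; localWords() below strips
# these from the vocabulary, since the highest-frequency words are mostly stopwords that carry
# little information about the class.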
def localWords(feed1,feed0):
import feedparser
docList=[];classList=[];fullText=[]
minLen = min(len(feed1['entries']),len(feed0['entries']))
for i in range(minLen):
wordList = textParse(feed1['entries'][i]['summary'])
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = textParse(feed0['entries'][i]['summary'])
docList.append(wordList)
        fullText.extend(wordList)
classList.append(0)
vocabList = createVocabList(docList)
top30Words = calMostFreq(vocabList,fullText)
for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
trainingSet= range(2*minLen);testSet=[]
for i in range(20):
randIndex = int(random.uniform(0,len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
trainMat=[];trainClasses=[]
for docIndex in trainingSet:
trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
errorCount = 0
for docIndex in testSet:
wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
errorCount += 1
print 'the error rate is :',float(errorCount)/len(testSet)
return vocabList,p0V,p1V
if __name__ == '__main__':
    multiprocessing.freeze_support()
    crossVarify()