Naive Bayes

A classification method based on probability theory: Naive Bayes

I have recently been studying Naive Bayes, working mainly from two books: 统计学习方法 (Statistical Learning Methods) and 机器学习实战 (Machine Learning in Action).
Before studying Naive Bayes, readers should first review the relevant parts of probability theory.

Learning and classification with Naive Bayes

1. The basic method
2. What maximizing the posterior probability means

Parameter estimation for Naive Bayes

1. Maximum likelihood estimation
2. The learning and classification algorithm
3. Bayesian estimation

For the details, please read 《统计学习方法》; the core formulas are briefly recapped below.
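As a quick reminder (these are the standard results; the notation follows the book): the classifier picks the class with the largest posterior probability, which is equivalent to minimizing the expected 0-1 loss. Under the conditional-independence ("naive") assumption this reduces to

y = \arg\max_{c_k} P(Y = c_k) \prod_{j} P(X^{(j)} = x^{(j)} \mid Y = c_k)

The Bayesian estimate of the conditional probabilities adds a smoothing constant \lambda (with \lambda = 1 this is Laplace smoothing, which is why the training code below initializes the word counts to 1 instead of 0):

P_\lambda(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{ \sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\, y_i = c_k) + \lambda }{ \sum_{i=1}^{N} I(y_i = c_k) + S_j \lambda }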

《机器学习实战》 works through a concrete Naive Bayes classification example together with its implementation code; please read it and make sure you understand the code.

The source code is as follows:

#-*- coding:utf-8 -*-
from numpy import *
import re
import chardet
import multiprocessing
from multiprocessing import Pool  # process pool for parallel vectorization

#from math import *


################ word-list-to-vector conversion functions #########################
"""
def loadDataSet():
    postingList=[['my','dog','has','flea',\
                  'problems','help','please'],
                 ['maybe','not','take','him',\
                  'to','dog','park','stupid'],
                 ['my','dalmation','is','so','cute',\
                 'I','love','him'],
                 ['stop','posting','stupid','worthless','garbage'],
                 ['mr','licks','ate','my','steak','how',\
                  'to','stop','him'],
                 ['quit','buying','worthless','dog','food','stupid']]
    classVec=[0,1,0,1,0,1]  # 1: abusive wording, 0: normal speech
    return postingList,classVec  # postingList: the tokenized documents, classVec: their class labels
"""

def loadData(fileName):
    try:
        #trainList=open(fileName).read()
        fr = open(fileName).readlines()
    except:
        print "failed to open the file"
        return 0
    pos=[];classVec=[]
    for line in fr:
        pos.append(line.decode('gbk','ignore'))
        classVec.append(0)   # 0 means a normal (positive) review
    print pos[0],pos[1]
    print classVec
    return pos,classVec

# GBK-encoded punctuation to strip from the tokenized reviews:
# '\xa1\xa3'=。  '\xa3\xac'=,  '\xa3\xbf'=?  '\xa3\xa1'=!  '\xa3\xbb'=;  '\xa3\xba'=:
# '\xa1\xb0'/'\xa1\xb1'=double quotes  '\xa1\xae'/'\xa1\xaf'=single quotes  '\xa3\xa8'/'\xa3\xa9'=parentheses  '\xa1\xa2'=、
PUNCTUATION = (',', '.', '!', '?', '(', ')', '\"', '\'',
               '\xa1\xa3', '\xa3\xac', '\xa3\xbf', '\xa3\xa1', '\xa3\xbb',
               '\xa3\xba', '\xa1\xb0', '\xa1\xb1', '\xa1\xae', '\xa1\xaf',
               '\xa3\xa8', '\xa3\xa9', '\xa1\xa2')

def testTextParse(filename,classify):
    text = open(filename).read()
    pattern = '<text>(.*?)</text>'
    str_list = re.findall(pattern, text, re.S)  # re.S: '.' also matches newlines, so reviews spanning several lines are captured
    doc_list = []
    # \s matches any whitespace (spaces, newlines, tabs); compiling the pattern once avoids re-compiling it for every document
    ptn = re.compile('\\s*')

    for doc in str_list:
        doc = ptn.split(doc)
        doc_list.append([term for term in doc if len(term) >= 1 and term not in PUNCTUATION])
    # for i in range(len(doc_list[0])):
    #     print doc_list[0][i].decode('gbk')
    if classify==0:
        classVec=zeros( len(doc_list))
    else :
        classVec=ones(len(doc_list))
    print 'class',classify,':len of doc_list',len(doc_list),' ,len of classVec',len(classVec)
    print classVec

    return doc_list,classVec

def testText(filename):
    text = open(filename).read()
    pattern = '<text>(.*?)</text>'
    str_list = re.findall(pattern, text, re.S)  # re.S: '.' also matches newlines
    doc_list = []
    ptn = re.compile('\\s*')  # split on any run of whitespace; see testTextParse above

    for doc in str_list:
        doc = ptn.split(doc)
        doc_list.append([term for term in doc if len(term) >= 1 and term not in PUNCTUATION])
    # print the first parsed document so the tokenization can be eyeballed
    for i in range(len(doc_list[0])):
        print doc_list[0][i].decode('utf-8'),
    return doc_list



# Build a list of every distinct word that appears in any document: feeding each token list to set() and taking unions removes duplicates
def createVocabList(dataSet):
    vocabSet=set([])     # start from an empty set
    for document in dataSet:
        vocabSet=vocabSet | set(document )       # union of the two sets
    return list(vocabSet)

########  set-of-words model  ##########
# Each word's presence or absence is used as a feature; this is called the set-of-words model
def setOfWords2Vec(vocabList,inputSet):
    returnVec=[0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print "the word: %s is not in my Vocabulary!"% word
    return returnVec
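# --- a minimal sketch (not part of the book's code): sanity-check createVocabList and setOfWords2Vec ---
# The tiny vocabulary below is hypothetical; each word that is present flips its slot to 1,
# and repeated words make no difference in the set-of-words representation.
def _demoSetOfWords():
    vocab = createVocabList([['my', 'dog'], ['stupid', 'love']])   # deduplicated word list (order is arbitrary)
    print setOfWords2Vec(vocab, ['my', 'dog', 'my'])               # e.g. [1, 1, 0, 0], up to vocabulary order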

######### bag-of-words model    ############
# If a word appears more than once in a document, that may carry information that mere presence/absence cannot express; counting occurrences per word is called the bag-of-words model
def bagOfWords2VecMN(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
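# --- a minimal sketch (not part of the book's code): bag-of-words keeps counts instead of 0/1 flags ---
def _demoBagOfWords():
    vocab = ['my', 'dog', 'stupid', 'love']                 # hypothetical fixed vocabulary, so the output order is known
    print setOfWords2Vec(vocab, ['my', 'dog', 'my'])        # -> [1, 1, 0, 0]  (presence only)
    print bagOfWords2VecMN(vocab, ['my', 'dog', 'my'])      # -> [2, 1, 0, 0]  ('my' counted twice)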




################ Naive Bayes classifier training function #################
"""
# input:文档矩阵trainMatrix,由每篇文档类别标签所构成的向量trainCategory
# 首先,计算文档属于class=1的概率(侮辱性文档),即P(1),P(0)=1-P(1)
# 计算p(wi|c1)和p(wi|c0),初始化程序中的分子变量和分母变量。由于w中元素众多,可以使用NUmpy数组快速计算这些值。
# 上述程序中的坟墓变量是一个元素个数等于词汇表大小的NumPy数组。
# 在for循环中,遍历训练集trainMatrix的所有文档,一旦某个词语出现,则该词对应的个数(p1Num或者p0Num)就加1,
# 而在所有文档中,该文档的总词数也加1,对于两个类别要同样处理。
# 最后,对每个元素除以该类别中的总词数。利用NumPy可以很好实现,用一个数组除以浮点数即可。
# 最后,函数返回2个向量和1个概率。
"""
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)   # number of documents, e.g. 2000; trainMatrix is 2000 x 8330: each document is a 0/1 vector over the vocabulary
    numWords = len(trainMatrix[0])    # vocabulary size, e.g. 8330 (every row has the same length)
    print 'len(trainMatrix[0]):',numWords
    pAbusive = sum(trainCategory)/float(numTrainDocs)      # p(c1) = (number of class-1 documents) / (total documents); sum() works because the labels are 0/1
    #p0Num = zeros(numWords);p1Num=zeros(numWords)
    #p0Denom = 0.0;p1Denom =0.0
    p0Num = ones(numWords);p1Num=ones(numWords)  # we multiply many probabilities p(w0|1)*p(w1|1)*...; if any factor is 0 the whole product is 0,
    p0Denom = 2.0;p1Denom =2.0                   # so counts start at 1 and denominators at 2 (Laplace smoothing, i.e. the Bayesian estimate with lambda = 1 over two classes)
    for i in  range (numTrainDocs):
        if trainCategory[i] == 1:       # document i has class 1, so everything accumulated below is conditioned on class = 1
            p1Num += trainMatrix[i]     # conditional-independence assumption: p1Num accumulates per-word counts, e.g. p1Num=[1,2,0,34,0,...,0,2]
            p1Denom += sum(trainMatrix[i])   # total word count in class 1; equal to summing all entries of p1Num (apart from the smoothing offsets)
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # p1Vect = p1Num / p1Denom    would give the p(wi|c1) vector: given class 1, p(w0|c1) = count(w0 in class 1) / (total words in class 1)
    # p0Vect = p0Num / p0Denom    likewise for p(wi|c0)
    print "len(p1Num):",len(p1Num)
    print "len(p0Num):",len(p0Num)
    p1Vect = log(p1Num/p1Denom)   # change to log(): multiplying many tiny probabilities underflows to 0, so we work with sums of logs instead
    p0Vect = log(p0Num/p0Denom)  # change to log()
    return p0Vect,p1Vect,pAbusive
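# --- a minimal sketch (not part of the book's code): trainNB0 on a hypothetical 3-document toy matrix ---
# Because of the smoothing above (counts start at 1, denominators at 2) no word ends up with
# probability 0, so the log() calls never see a zero.
def _demoTrainNB0():
    toyMat = [[1, 1, 0], [0, 1, 1], [1, 0, 0]]     # 3 documents over a 3-word vocabulary
    toyLabels = [0, 1, 0]
    p0V, p1V, pAb = trainNB0(array(toyMat), array(toyLabels))
    print pAb                                       # 1/3: the fraction of class-1 documents
    print p0V, p1V                                  # per-word log conditional probabilities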

##########   the Bayes classifier  #########
"""
# vec2Classify: the vector to classify
# p0Vec: log p(wi|c0)
# p1Vec: log p(wi|c1)
# pClass1: p(c1)
# p(ci|w) is proportional to p(w|ci)*p(ci); the denominator p(w) is the same for both classes, so it can be ignored
"""
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)   # element-wise product, sum the values of the present words, then add the class's log prior; log A + log B + log C = log(A*B*C)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 -pClass1)
    if p1 >p0 :
        return  1  # negative review
    else:
        return  0  # positive review
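# --- a minimal sketch (not part of the book's code): the scoring happens entirely in log space ---
# sum(vec2Classify * pVec) adds up log p(wi|c) for exactly the words that are present (vec[i] == 1),
# and adding log(pClass) gives log( p(c) * prod_i p(wi|c) ); the class with the larger score wins.
def _demoClassifyNB():
    p0V = log(array([0.6, 0.2, 0.2]))               # hypothetical p(wi|class 0)
    p1V = log(array([0.1, 0.1, 0.8]))               # hypothetical p(wi|class 1)
    print classifyNB(array([0, 0, 1]), p0V, p1V, 0.5)   # word 2 is typical of class 1 -> prints 1
    print classifyNB(array([1, 0, 0]), p0V, p1V, 0.5)   # word 0 is typical of class 0 -> prints 0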

def testingNB():
    listPosts,listClasses=loadDataSet()
    myVocabList = createVocabList(listPosts)
    trainMat=[]
    for postinDoc in listPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb=trainNB0(trainMat,listClasses)
    testEntry = ['love','my','dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print testEntry,'classified as :',classifyNB(thisDoc,p0V,p1V,pAb)
    testEntry=['stupid','garbage']
    thisDoc =array(setOfWords2Vec(myVocabList,testEntry))
    print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)

###    load the data set    ###
# returns  listPosts      the list of tokenized documents
#          listClasses    the list of class labels
def loadDataSet():
    listPosts,listClasses=testTextParse("dataset/positive.txt",0)  #0:positive
    a=chardet.detect(listPosts[0][0])
    print a
    print 'positive len(listPosts):',len(listPosts)  #2000
    print "positive listClasses:",listClasses
    listPosts1,listClasses1=testTextParse("dataset/negative.txt",1)  #1:negative
    print ' negative len(listPosts1):',len(listPosts1)
    print "negative listClasses1:",listClasses1

    listPosts += listPosts1
    listClasses =list(listClasses)+list(listClasses1)
    listClasses = array(listClasses)
    print "positive+negative:len(listPosts):",len(listPosts)  #4000
    print "positive+negative:len(listClasses):",len(listClasses)
    print "listClasses:",listClasses
    # listPosts is the resulting list, one tokenized review per entry, e.g.
    # listPosts[0] = 距离 川沙 公路 较 近 但是 公交 指示 不 对 如果 是 蔡陆线 的话 会 非常 麻烦 建议 用 别 的 路线 房间 较为 简单
    # (a word-segmented Chinese hotel review), then listPosts[1], ...
    return listPosts,listClasses




def testingNBChinese(fileName):
    listPosts,listClasses = loadDataSet()
    myVocabList=createVocabList(listPosts)  # build the vocabulary
    print 'len(myVocabList):',len(myVocabList)  # 0:8330  1:15844-8330
    ####  the next 3 lines can be deleted; they sample a few entries to check that the vocabulary was built  #####
    for i in range(len(myVocabList)/1000):
        #if chardet.detect(myVocabList[i])
        a=myVocabList[i].decode('utf-8','ignore')
        print a

    ####  the next 4 lines can be deleted; they show where the words of the first document sit in the vocabulary  ####
    returnVec=setOfWords2Vec(myVocabList,listPosts[0])  # convert the words into a 0/1 vector over the vocabulary
    for i in range(len(returnVec)):
        if returnVec[i]==1:
            print i,myVocabList[i].decode('utf-8','ignore')

    #trainMat=[]
    #for postinDoc in listPosts:  # over all 4000 documents
    #       trainMat.append(setOfWords2Vec(myVocabList,postinDoc))  # convert each document into a vector

    #save("dataset/4000Vec.npy",trainMat)
    trainMat=load("dataset/4000Vec.npy")  # the cached 0/1 vectors of the 4000 documents
    print trainMat[0],len(trainMat[0])  # 15844 -> after switching the encoding to utf-8 the vocabulary length became 15837
    for i in range(len(trainMat[0])):
        if trainMat[0][i]==1:
            print i,
    p0V,p1V,pClass1=trainNB0(trainMat,listClasses)
    #testEntry=testText("dataset/testDatasetPositive.txt")
    testEntry=testText(fileName)
    for i in range(len(testEntry)):
        testDoc=setOfWords2Vec(myVocabList,testEntry[i])
        print "len of testDoc:",len(testDoc)
        for i in range(len(testDoc)):
            if testDoc[i]==1:
                print i,myVocabList[i].decode('utf-8','ignore'),
        print 'testEntry classified as: ',classifyNB(testDoc,p0V,p1V,pClass1)

    testEntry=testText("dataset/testDatasetNegative.txt")
    testDoc=setOfWords2Vec(myVocabList,testEntry[0])
    print "len of testDoc:",len(testDoc)
    for i in range(len(testDoc)):
        if testDoc[i]==1:
            print i,myVocabList[i].decode('utf-8','ignore'),
    print 'testEntry classified as: ',classifyNB(testDoc,p0V,p1V,pClass1)

    print "OK"

########         cross-validation         ###########
def crossVarify():
    #for k in range(10):    # average over 10 rounds of cross-validation
        #print "test round",k,":"
        testSet=[]
        listPosts,listClasses = loadDataSet()

        #trainSet=listPosts
        print len(listPosts)
        trainSet=range(len(listPosts))   # a plain list of indices, so that del() works later (del cannot remove elements from a numpy array)
        #print trainSet
        myVocabList = createVocabList(listPosts)     # build the vocabulary
        print "len(myVocabList):",len(myVocabList)

        from functools import partial
       # partial_setOfWords2Vec = partial(setOfWords2Vec,vocabList =myVocabList)  # partial() pre-binds one argument of the function (see the note before the pool.map_async call)

        # hold out 400 documents as the test set
        for i in range(400):
            randIndex = int(random.uniform(0,len(trainSet)))  # a random index in [0, len(trainSet))
            testSet.append(trainSet[randIndex])

            del(trainSet[randIndex])  # remove the chosen document from the training set

        print "testSet :",testSet
        trainMat=[];trainClasses=[]
        ### the (single-process) for loop below would build the training matrix directly  ###
        # for docIndex in trainSet:
        #     trainMat.append(setOfWords2Vec(vocabList,listPosts[docIndex]))
        #     trainClasses.append(listClasses[docIndex])
        print "multiprocesses start:"
        pool = Pool(processes=8)              # start 8 worker processes
        # map_async: the non-blocking version of map
        # partial(setOfWords2Vec,myVocabList): binds myVocabList to the function's first parameter;
        # writing partial(setOfWords2Vec,vocabList = myVocabList) instead raises "got multiple values for argument"
        # [listPosts[docIndex] for docIndex in trainSet]: selects the listed documents from listPosts,
        # equivalent to: for docIndex in trainSet: listPosts[docIndex]
        # .get(120): fetches the result of pool.map_async(); gives up if nothing arrives within 120 seconds
        trainMat.append(pool.map_async(partial(setOfWords2Vec,myVocabList),[listPosts[docIndex] for docIndex in trainSet] ).get(120))

        pool.close()
        pool.join()

        print "trainMat.append OK, multiprocesses end."

        print "len(trainMat):",len(trainMat)

        for docIndex in trainSet:
            trainClasses.append(listClasses[docIndex])
        # for i in range(len(trainMat[0])) :
        #    if trainMat[0][i] != 0:
        #        a = trainMat[0][i]
        #        print i,a
        # print "trainMat[0]=",trainMat[0]

        print "trainClasses:",trainClasses
        #p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))    # without multiprocessing, use trainMat
        p0V,p1V,pSpam = trainNB0(array(trainMat[0]),array(trainClasses))  # with multiprocessing, use trainMat[0] (map_async returned one list, appended as a single element of trainMat)
        errorCount = 0
        print "errorCount=0"
        for docIndex in testSet:
            wordVector = setOfWords2Vec(myVocabList,listPosts[docIndex])
            testClassify = classifyNB(array(wordVector),p0V,p1V,pSpam)
            print "第%d个测试分类:%d,实际分类:%d" %(docIndex,testClassify,listClasses[docIndex])
            if testClassify != listClasses[docIndex]:
                errorCount += 1


        print errorCount
        print 'the error rate is :',float(errorCount)/len(testSet)
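# --- a minimal sketch (not part of the book's code): what partial() does for the pool.map_async call above ---
# partial(setOfWords2Vec, myVocabList) pre-binds the first positional parameter (vocabList), so the pool
# only has to send each document; the keyword form partial(setOfWords2Vec, vocabList=myVocabList) fails
# because map() then passes each document positionally into vocabList as well ("got multiple values").
def _demoPartial():
    from functools import partial
    vocab = ['my', 'dog', 'stupid']                 # hypothetical tiny vocabulary
    toVec = partial(setOfWords2Vec, vocab)          # the same kind of callable the worker processes receive
    print map(toVec, [['my', 'dog'], ['stupid']])   # -> [[1, 1, 0], [0, 0, 1]]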

if __name__ == '__main__':
    multiprocessing.freeze_support()

    crossVarify()

########          file parsing and the complete spam-email test function    ##############
###  textParse() takes one big string and parses it into a list of token strings ###
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok)>2]
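# --- a minimal sketch (not part of the book's code): what textParse() produces ---
# It splits on every non-word character and keeps only lower-cased tokens longer than two characters,
# which drops short words like "is"/"on" as well as the empty strings the split leaves behind.
def _demoTextParse():
    print textParse('This book is the BEST book on Python or anything!')
    # -> ['this', 'book', 'the', 'best', 'book', 'python', 'anything']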


def spamTest():
    docList=[];classList=[];fullText=[]
    #the for loop imports and parses the text files
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet =range(50);testSet=[]
    #randomly pick 10 of the 50 emails as the test set: the chosen indices are appended to testSet and deleted from trainingSet
    #randomly setting aside part of the data for testing while training on the rest is called hold-out cross validation
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat=[];trainClasses=[]
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount+=1
    print 'the error rate is :',float(errorCount)/len(testSet)

########         RSS feed classifier and high-frequency word removal         ##############
def calMostFreq(vocabList,fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.iteritems(), key = operator.itemgetter(1),reverse=True)
    return sortedFreq[:30]

def localWords(feed1,feed0):
    import feedparser
    docList=[];classList=[];fullText=[]
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)   # extend, not append: calMostFreq() counts individual tokens in fullText
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calMostFreq(vocabList,fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:vocabList.remove(pairW[0])
    trainingSet= range(2*minLen);testSet=[]
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat=[];trainClasses=[]
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is :',float(errorCount)/len(testSet)
    return vocabList,p0V,p1V
