机器学习实战第四章朴素贝叶斯算法照葫芦画瓢算法实践

本文链接：https://blog.csdn.net/u011481752/article/details/70544910
完成了朴素贝叶斯分类器的主要算法，并利用它过滤垃圾邮件，以及从个人发布的大量广告中学习分类器，并将学习结果转换成人可以理解的信息。
用到了feedparser库中相关的函数来访问RSS源。如果是在Windows下，且装有Anaconda，就不需要去官网下载安装包、解压再安装，直接在命令行中
输入conda install feedparser一般就能安装成功，非常方便。
# -*- coding: utf-8 -*-
"""
照葫芦画瓢完成于2017.4.23 20:25
算法名称 : 基于朴素贝叶斯的分类方法
算法整体思路：
  通过统计想要的不同类型的数据出现的频率，转换成概率，依照条件概率进行具体的分类，主要对于一些相关文本的属性进行分类。
  1.从文本中构建词向量
  2.通过构建出的词向量计算概率
  3.构建文件词袋模型
  4.切分文本，解析文本，构建训练集，利用朴素贝叶斯对测试集中的文本进行分类
作者:
    zzt941006
"""
from numpy import *
def loadDataSet():
    """Return the toy message-board corpus together with its labels.

    Returns:
        A ``(postingList, classVec)`` pair.  ``postingList`` is a list of
        six already-tokenised posts and ``classVec[i]`` is 1 when post ``i``
        is abusive, 0 when it is normal speech.
    """
    # Each entry keeps a post next to its label so the two cannot drift apart.
    labelledPosts = (
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    )
    postingList = [words for words, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec
def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding each distinct token exactly once.  The order is
        whatever the underlying set yields and is not otherwise specified.
    """
    uniqueWords = set()
    for doc in dataSet:
        # Fold this document's tokens into the running union.
        uniqueWords.update(doc)
    return list(uniqueWords)
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: Vocabulary list; slot ``i`` of the result corresponds to
            ``vocabList[i]``.
        inputSet: Iterable of tokens making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word
        occurs in ``inputSet`` at least once and 0 elsewhere.

    Tokens that are not in the vocabulary are reported on stdout and
    otherwise ignored.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        try:
            # One scan instead of ``in`` followed by ``index``.
            slot = vocabList.index(word)
        except ValueError:
            # The single-argument call form prints the same text under
            # Python 2 and Python 3.
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[slot] = 1
    return returnVec
#训练算法通过词向量计算概率
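# Inside the loop: a word such as 'my' that occurs 3 times in class 0 adds 3 to its slot of p0Num (which starts at 1, giving 4), and
# p0Denom accumulates the word total of the non-abusive documents, 7 + 8 + 9 = 24 (it starts at 2.0, giving 26). Unsmoothed, p(my | non-abusive) = 3 / 24 = 1 / 8; with the smoothing used in the code it is 4 / 26.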
def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: Sequence of equally long word-count vectors, one per
            document.
        trainCategory: Sequence of labels, one per document; a document
            counts as class 1 when its label equals 1 and as class 0
            otherwise.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect`` hold
        the log of each word's smoothed relative frequency within class 0 /
        class 1, and ``pAbusive`` is ``sum(trainCategory)`` divided by the
        number of documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Counts start at 1 and denominators at 2.0, so a word never seen in a
    # class still gets a non-zero ratio and the log below stays finite.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    wordTotals = {0: 2.0, 1: 2.0}

    for i in range(docCount):
        label = 1 if trainCategory[i] == 1 else 0
        wordCounts[label] += trainMatrix[i]      # per-word occurrences
        wordTotals[label] += sum(trainMatrix[i])  # all words in this document

    # Work in log space so products of many small factors become sums.
    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive
#注意一下这里概率的计算。p1 = p (ci = 1 | w) = p(w|ci = 1) * p(ci = 1) / p(w)
#而p0的计算也要除以p(w)忽略分母，只管分子，即只考虑 p(w | ci = 1) * p(ci = 1)和 p(w|ci = 0) * p(ci = 0)的大小即可以知道属于哪一类
#那么此时取个log防止过小，则有log(p(w | ci = 1) * p(ci = 1)) = Σlog(p(wj | ci = 1)) + log(p(ci = 1))
#log(p(w | ci = 0) * p(ci = 0)) = Σlog(p(wj | ci = 0)) + log(p(ci = 0)) = Σlog(p(wj | ci = 0)) + log(1-p(ci = 1))
#从而对应了下面这个分类函数的p1和p0的计算方式，又因为在train中，已经取了log 所以求和只要单纯的向量相乘求和即可
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more probable class for a word vector.

    Args:
        vec2Classify: Word vector of the document (array, so that it can be
            multiplied element-wise with the probability vectors).
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1; class 0 uses ``1 - pClass1``.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        score, otherwise 0.  An exact tie therefore yields 0 instead of
        falling through and returning ``None``.
    """
    # The inputs are already logs, so the joint score is a weighted sum plus
    # the log prior.
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
#构建文档词袋模型，跟最开始的setOfWords2Vec只有一个区别，那就是在词袋中，每个单词可以出现多次，set中只是记录有多少个不同的单词。
#词袋模型中开一个长度为单词表总长度的数组，并初始化为全0，然后读所有的输入进来，统计输入的向量每个单词出现的次数。
#一句话概括就是set返回的是一个01矩阵，只有2种不同的元素，而bag里面的元素不止 0 和 1 两种取值
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: Vocabulary list; slot ``i`` of the result corresponds to
            ``vocabList[i]``.
        inputSet: Iterable of tokens making up the document.

    Returns:
        A list of ``len(vocabList)`` ints giving how many times each
        vocabulary word occurs in ``inputSet``.  Tokens outside the
        vocabulary are silently skipped.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Not a vocabulary word: nothing to count.
            continue
        counts[slot] += 1
    return counts
#测试分类的代码，输入几个单词，然后生成其对应的单词表下的向量，然后通过分类函数来测试它属于侮辱性文档还是非侮辱性文档
def testingNB():
    """Train on the toy corpus and print the predicted class of two samples.

    Builds the vocabulary from ``loadDataSet()``, trains with set-of-words
    vectors, then classifies two hard-coded token lists.  For each one it
    prints the word vector, the tokens and the predicted label.  Returns
    nothing.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    sampleEntries = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'love', 'my', 'to', 'cute', 'please'],
    )
    for testEntry in sampleEntries:
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        label = classifyNB(thisDoc, p0V, p1V, pAb)
        # One pre-formatted argument gives the same line under Python 2 and
        # Python 3; the double space reproduces the original
        # comma-separated print of 'classified as: '.
        print('%s %s classified as:  %s' % (thisDoc, testEntry, label))
#接受一个大字符串，并将其解析为字符串列表
def textPrase(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: The text to tokenise.

    Returns:
        A list of tokens in their original order.  The text is split on
        runs of non-word characters (anything other than letters, digits
        and underscore), tokens of length two or less are dropped, and the
        rest are lower-cased.
    """
    import re
    # r'\W+' (one or more non-word characters) is the intended separator.
    # The previous pattern had lost its backslash and matched the literal
    # letter "W".  "+" rather than "*" keeps the pattern from matching the
    # empty string, which newer Python versions would also split on.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# Alias under the conventional spelling so that code calling ``textParse``
# resolves to the same function.
textParse = textPrase
#文件解析及完整的垃圾邮件测试函数，并使用朴素贝叶斯进行交叉验证
def spamTest():
    """Run one hold-out evaluation of the spam classifier and print the result.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), tokenises them,
    holds out 10 randomly chosen documents as the test set, trains on the
    rest with bag-of-words vectors, and classifies the held-out documents.
    Each misclassified document and the final error rate are printed.
    Returns nothing.
    """
    docList = []    # one token list per e-mail
    classList = []  # label of the e-mail at the same position
    labelledPaths = ((1, 'email/spam/%d.txt'), (0, 'email/ham/%d.txt'))
    for i in range(1, 26):
        # Spam and ham are appended alternately, so labels run 1, 0, 1, 0, ...
        for label, pathPattern in labelledPaths:
            # ``with`` closes the file even if tokenising raises.
            with open(pathPattern % i) as emailFile:
                # textPrase is the tokenizer as this file names it.
                docList.append(textPrase(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # A real list is required: entries are removed as they move to the test
    # set, which a Python 3 ``range`` object does not support.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # Draw without replacement so the test indices are distinct.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # Bag-of-words rows hold occurrence counts, not just 0/1 flags.
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # Pre-formatted single argument: same output on Python 2 and 3.
            print("classification error %s" % (docList[docIndex],))
    print('the error rate is:  %s' % (float(errorCount) / len(testSet),))
#统计出现次数最多的前30个单词
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words with their counts.

    Args:
        vocabList: Iterable of vocabulary words to rank.
        fullText: List of tokens in which occurrences are counted (via
            ``fullText.count``).
        topN: Maximum number of entries to return; defaults to 30.

    Returns:
        A list of at most ``topN`` ``(word, count)`` tuples sorted by count,
        highest first.
    """
    import operator
    freqDict = {token: fullText.count(token) for token in vocabList}
    # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` is Python 2 only.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]
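# feed1 and feed0 are two RSS feeds, so the input here is feed content rather than fixed files. As a preprocessing step the 30 most frequent
# words are removed from the vocabulary, which can improve accuracy. The vocabulary and the per-class word log-probabilities are returned; the rest closely mirrors spamTest().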
def localWords(feed1, feed0):
    """Train and evaluate a classifier that tells two RSS feeds apart.

    Args:
        feed1: Feed whose entries are labelled 1; read as
            ``feed1['entries'][i]['summary']``.
        feed0: Feed whose entries are labelled 0; read the same way.

    Only the first ``min(len(feed1['entries']), len(feed0['entries']))``
    entries of each feed are used.  The 30 most frequent words are removed
    from the vocabulary, 20 randomly chosen documents are held out for
    testing, and the error rate on them is printed.

    Returns:
        ``(vocabList, p0V, p1V)``: the pruned vocabulary and the per-word
        log-probability vectors for class 0 and class 1.

    NOTE(review): holding out 20 documents assumes the two feeds provide
    more than 20 documents in total; with fewer, indexing fails.
    """
    docList = []
    classList = []
    fullText = []  # every token of every document, duplicates included
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Entries are appended alternately, so labels run 1, 0, 1, 0, ...
        for label, feed in ((1, feed1), (0, feed0)):
            # textPrase is the tokenizer as this file names it.
            wordList = textPrase(feed['entries'][i]['summary'])
            docList.append(wordList)
            # extend, not append: fullText must be a flat token list so
            # that calcMostFreq can count individual words in it.
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)

    # Drop the most frequent words from the vocabulary.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # A real list is required: entries are removed as they move to the test
    # set, which a Python 3 ``range`` object does not support.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        # Draw without replacement so the test indices are distinct.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # Bag-of-words rows hold occurrence counts, not just 0/1 flags.
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # Pre-formatted single argument: same output on Python 2 and 3.
    print('the error rate is:  %s' % (float(errorCount) / len(testSet),))
    return vocabList, p0V, p1V
#将两个RSS源作为输入，通过训练后再利用朴素贝叶斯分类，返回排名最具代表性的词汇，可以设定一个阈值，来获取这些分类之后的词汇，并按照该词汇出现
#的条件概率的高低，进行排序输出
def getTopWords(ny, sf, threshold=-6.0):
    """Print the words most characteristic of each of two RSS feeds.

    Args:
        ny: Feed passed to ``localWords`` as class 1.
        sf: Feed passed to ``localWords`` as class 0.
        threshold: A word is listed for a class only when its
            log-probability in that class is greater than this value;
            defaults to -6.0.

    Trains via ``localWords(ny, sf)``, then prints, under an "SF" banner and
    an "NY" banner, the qualifying words of class 0 and class 1 in
    descending order of log-probability, followed by a "VC" banner and the
    whole vocabulary.  Returns nothing.
    """
    vocabList, p0V, p1V = localWords(ny, sf)

    # The probability vectors are indexed like the vocabulary, so the
    # sequences can be walked in lockstep.
    topSF = [(word, logProb) for word, logProb in zip(vocabList, p0V) if logProb > threshold]
    topNY = [(word, logProb) for word, logProb in zip(vocabList, p1V) if logProb > threshold]

    # Every print below takes a single argument so the output is the same
    # under Python 2 and Python 3.
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
    print("VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**VC**")
    print(vocabList)