Naive Bayes

While debugging I found that the Naive Bayes algorithm is fairly sensitive to its training data. My intuition for the classifier: it multiplies the input word vector elementwise by the log-probability vector of one class and sums the result, does the same with the other class's vector, and then simply compares which of the two (log-)probability values is larger.
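
To make that concrete, here is a minimal sketch of the comparison with made-up numbers (the probability vectors are invented for illustration, not computed from the book's data); this is exactly the comparison classifyNB performs later:

import numpy as np

# hypothetical 3-word vocabulary; these probabilities are invented
p0Vec = np.log(np.array([0.5, 0.3, 0.2]))  # log P(word | class 0)
p1Vec = np.log(np.array([0.1, 0.2, 0.7]))  # log P(word | class 1)
pClass1 = 0.5                              # prior P(class 1)

vec = np.array([1, 0, 1])                  # word vector of the document to classify
p1 = np.sum(vec * p1Vec) + np.log(pClass1)        # log( P(doc|c1) * P(c1) )
p0 = np.sum(vec * p0Vec) + np.log(1.0 - pClass1)  # log( P(doc|c0) * P(c0) )
print(1 if p1 > p0 else 0)                 # prints 0 for these numbers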

What it implements:

1. Detecting abusive comments in tokenized posts

2. Classifying spam email

navieBray.py

from numpy import *
import feedparser

class naviebray(object):
    def __init__(self):
        pass
    
    def loadDataSet(self):
        '''
        postingList: the collection of documents, already split into word tokens
        classVec: the class label of each document
        '''
        postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                     ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                     ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                     ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                     ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                     ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classVec = [0,1,0,1,0,1]    # 1 = abusive, 0 = normal speech
        return postingList,classVec
    
    def createVocabList(self,dataSet):
        vocabSet = set([])  # use a set to build a vocabulary of unique words
        for document in dataSet:
            vocabSet = vocabSet | set(document)  # union of the two sets
        return list(vocabSet)
    
    def setOfWords2Vec(self,vocabList, inputSet):
        returnVec = [0]*len(vocabList)  # create a vector of zeros, one slot per vocabulary word
        # for each word in the document, set the corresponding slot to 1 if it appears in the vocabulary
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] = 1
            else: print("the word: %s is not in my Vocabulary!" % word)
        return returnVec
    '''
    Treating the presence or absence of each word as a feature is known as the
    set-of-words model. If a word appears in a document more than once, the
    count may carry information that mere presence or absence cannot express;
    recording counts instead is known as the bag-of-words model. In a bag of
    words each word can appear many times, while in a word set each word
    appears at most once. Adapting setOfWords2Vec to the bag-of-words model
    takes only a small change; the modified function is called
    bagOfWords2VecMN (see the example comment after it below).
    '''
    def bagOfWords2VecMN(self,vocabList, inputSet):
        returnVec = [0]*len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] += 1
        return returnVec
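    # For example, with a hypothetical vocabList = ['stupid', 'dog', 'my']:
    #   setOfWords2Vec(vocabList, ['stupid', 'stupid', 'my'])   -> [1, 0, 1]
    #   bagOfWords2VecMN(vocabList, ['stupid', 'stupid', 'my']) -> [2, 0, 1]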
    
    def trainNB0(self,trainMatrix,trainCategory):
        '''
        Naive Bayes classifier training function (handles two-class problems only)
        trainMatrix: matrix of document word vectors
        trainCategory: class label of each document
        '''
        numTrainDocs = len(trainMatrix) # number of training documents (6 for the sample data)
        numWords = len(trainMatrix[0])  # vocabulary size (32 for the sample data)
        
        pAbusive = sum(trainCategory)/float(numTrainDocs) # P(class=1): fraction of abusive documents
    
        # initialize every word count to 1 and each denominator to 2
        # (Laplace-style smoothing) so that no conditional probability is ever 0
        p0Num = ones(numWords); p1Num = ones(numWords)
        p0Denom = 2.0; p1Denom = 2.0
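        # with this smoothing the per-word estimate becomes
        #   P(w_i | c) = (1 + count of w_i in class c) / (2 + total word count in class c)
        # (the book initializes the denominator to 2; textbook Laplace
        # smoothing would add the vocabulary size instead)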
        for i in range(numTrainDocs):
            if trainCategory[i] == 1:
                p1Num += trainMatrix[i]
                p1Denom += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0Denom += sum(trainMatrix[i])
        # take the natural log to avoid underflow caused by multiplying many small numbers
        p1Vect = log(p1Num/p1Denom)
        p0Vect = log(p0Num/p0Denom)
        return p0Vect,p1Vect,pAbusive
    
    def classifyNB(self,vec2Classify, p0Vec, p1Vec, pClass1):
        '''
        Classification function
        vec2Classify: the word vector to classify
        p0Vec, p1Vec, pClass1: the three probability terms returned by trainNB0
        '''
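        # the elementwise product summed gives sum_i vec[i] * log P(w_i | c),
        # i.e. log P(doc | c); adding the log prior yields the log posterior
        # up to a shared normalizing constant, so comparing p1 with p0
        # compares the posteriors directly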
        p1 = sum(vec2Classify * p1Vec) + log(pClass1)
        p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
        if p1 > p0:
            return 1
        else: 
            return 0
    
    def testingNB(self):
        listOPosts,listClasses = self.loadDataSet()
        myVocabList = self.createVocabList(listOPosts)
        trainMat=[]
        for postinDoc in listOPosts:
            trainMat.append(self.setOfWords2Vec(myVocabList, postinDoc))
            
        # train the model; note the lists are converted to numpy arrays here
        p0V,p1V,pAb = self.trainNB0(array(trainMat),array(listClasses))
        
        testEntry = ['love', 'my', 'dalmation']
        thisDoc = array(self.setOfWords2Vec(myVocabList, testEntry))
        print(testEntry,'classified as: ',self.classifyNB(thisDoc,p0V,p1V,pAb))
        testEntry = ['stupid', 'garbage']
        thisDoc = array(self.setOfWords2Vec(myVocabList, testEntry))
        print(testEntry,'classified as: ',self.classifyNB(thisDoc,p0V,p1V,pAb))
    
    def textParse(self,bigString):
        '''
        Split a text string into a list of lowercase word tokens
        '''
        import re
        listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word characters
        return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # drop tokens shorter than 3 characters
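    # e.g. textParse('Hi there, Mr. Smith!!') -> ['there', 'smith']
    # ('Hi' and 'Mr' are dropped because they are shorter than 3 characters)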
    
    def spamTest(self):
        '''
        Spam filter test: 25 spam + 25 ham emails, 10 randomly held out for testing
        '''
        docList=[]; classList = []; fullText =[]
        for i in range(1,26):
            # read a spam email and label it 1
            wordList = self.textParse(open('email/spam/%d.txt' % i,'r',encoding='utf-8').read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            # read the matching ham email and label it 0
            wordList = self.textParse(open('email/ham/%d.txt' % i,'r',encoding='utf-8').read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)
        vocabList = self.createVocabList(docList)  # build the vocabulary
        trainingSet = list(range(50))  # indices of all 50 emails
        testSet = []
        # randomly move 10 documents into the test set
        for i in range(10):
            randIndex = int(random.uniform(0,len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del(trainingSet[randIndex])
        trainMat=[]; trainClasses = []
        for docIndex in trainingSet:  # build the training matrix and label list
            trainMat.append(self.bagOfWords2VecMN(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        p0V,p1V,pSpam = self.trainNB0(array(trainMat),array(trainClasses))
        errorCount = 0
        # classify the held-out emails and compute the error rate
        for docIndex in testSet:
            wordVector = self.bagOfWords2VecMN(vocabList, docList[docIndex])
            if self.classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
                errorCount += 1
                print("classification error",docList[docIndex])
        print('the error rate is: ',float(errorCount)/len(testSet))
        return vocabList,fullText

if __name__ == '__main__':
    
    nb = naviebray()
    nb.testingNB()
#     nb.spamTest()

Output of testingNB():

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1

Output of spamTest():

The output changes from run to run because the test set is chosen at random; the error rate generally falls between 0.0 and 0.4.

classification error ['yeah', 'ready', 'may', 'not', 'here', 'because', 'jar', 'jar', 'has', 'plane', 'tickets', 'germany', 'for']
classification error ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is:  0.2



Another run:

classification error ['yeah', 'ready', 'may', 'not', 'here', 'because', 'jar', 'jar', 'has', 'plane', 'tickets', 'germany', 'for']
the error rate is:  0.1
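
Since the 10 test emails are drawn at random, a single run is noisy. A rough average can be obtained by repeating the split; a minimal sketch, assuming spamTest is modified to return float(errorCount)/len(testSet) instead of (vocabList, fullText):

nb = naviebray()
rates = [nb.spamTest() for _ in range(10)]  # each call makes a fresh random train/test split
print('mean error rate:', sum(rates) / len(rates))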

 

The book also applies the same classifier to posts pulled from two RSS feeds. The feeds it uses are no longer live, so I could not verify this part, but the code is posted below (it assumes the imports and the naviebray class from navieBray.py above).

'''
localWords() is nearly identical to spamTest() above; the difference is that it
reads from two RSS feeds rather than from files. It also calls calcMostFreq()
to find the 30 most frequent words and removes them from the vocabulary, since
these are mostly stopwords that would otherwise dominate the counts.
'''
def localWords(feed1,feed0):
    nb = naviebray()  # reuse the methods defined on the class above
    docList=[]; classList = []; fullText =[]
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = nb.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # entries from feed1 (NY) are class 1
        wordList = nb.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # entries from feed0 (SF) are class 0
    vocabList = nb.createVocabList(docList)  # create the vocabulary
    top30Words = calcMostFreq(vocabList,fullText)  # find the 30 most frequent words...
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])  # ...and remove them
    trainingSet = list(range(2*minLen)); testSet=[]  # create the test set
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:  # build the training matrix and labels
        trainMat.append(nb.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = nb.trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out entries
        wordVector = nb.bagOfWords2VecMN(vocabList, docList[docIndex])
        if nb.classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))
    return vocabList,p0V,p1V
 
def calcMostFreq(vocabList,fullText):
    '''
    Return the 30 highest-frequency words in fullText as (word, count) pairs
    '''
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token]=fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True) 
    return sortedFreq[:30]
 
if __name__== "__main__":
    # load the two RSS feeds (both URLs are dead by now)
    ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
 
    localWords(ny,sf)
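
As an aside, calcMostFreq can be written more compactly (and faster, since fullText.count rescans the whole list once per word) with collections.Counter from the standard library; an equivalent sketch, not part of the book's listing:

from collections import Counter

def calcMostFreq(vocabList, fullText):
    vocab = set(vocabList)                          # O(1) membership tests
    counts = Counter(tok for tok in fullText if tok in vocab)
    return counts.most_common(30)                   # top 30 (word, count) pairs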

 
