Machine Learning in Action (3): Classifying with Probability Theory: Naive Bayes (Python implementation)

This is code I wrote while working through Machine Learning in Action, to get a more intuitive feel for each algorithm. Simple or not, I type every line myself rather than just reading along.

Source code and data: https://pan.baidu.com/s/1G2S2pb5gfBnxGNNTFgTkEA (password: fov0)

Below is the main program, bayes.py, along with the problems I ran into while running it.

# -*- coding: utf-8 -*-
# author: Yufeng Song
from numpy import *
import re

import feedparser


# Toy training posts and their labels: 1 = abusive, 0 = not abusive
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


# Build the vocabulary: the union of all words across all documents
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
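
# Note (my own observation, not from the book): Python's set iteration order is
# not stable across interpreter runs, so the position of each word in this
# vocabulary, and hence in every vector built from it, can differ between runs.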


# Set-of-words model: only records whether a word appears, so entries are 0 or 1
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print('the word:%s is not in my Vocabulary!' % word)
    return returnVec
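
# A minimal usage sketch with toy inputs of my own (not the book's data):
#   setOfWords2Vec(['cat', 'dog', 'fish'], ['dog', 'dog', 'cat'])  ->  [1, 1, 0]
# the repeated 'dog' still maps to 1, because the set model records presence only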


# Bag-of-words model: counts occurrences, so entries can exceed 1
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
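
# The same toy inputs under the bag model (again my own example):
#   bagOfWords2VecMN(['cat', 'dog', 'fish'], ['dog', 'dog', 'cat'])  ->  [1, 2, 0]
# the repeated 'dog' now counts twice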


# Train the classifier: estimate each word's probability of appearing in
# abusive documents and in non-abusive documents
def trainNBO(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    # print(numTrainDocs)  # 6
    # print(trainCategory)  # [0, 1, 0, 1, 0, 1]
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # p0Num = zeros(numWords); p1Num = zeros(numWords)
    # p0Denom = 0.0; p1Denom = 0.0
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0  # start counts at 1 and denominators at 2 (Laplace smoothing) so no probability is ever 0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:  # abusive document
            p1Num += trainMatrix[i]  # increment the count at each word's position
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # print('#' * 50)
    # print(p1Num)
    # print(p1Denom)
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive  # p0Vect/p1Vect: per-word log probabilities for the non-abusive/abusive class
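
# A worked example of the smoothing, with made-up numbers: if a word occurs 3
# times across abusive documents totalling 19 words, its estimate is
# (3 + 1) / (19 + 2) ≈ 0.19, stored as log(0.19) ≈ -1.66. Working in log space
# turns the later product of many small per-word probabilities into a sum,
# avoiding floating-point underflow.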


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
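
# Why this works: by Bayes' rule p(c|w) is proportional to p(w|c) * p(c), and the
# naive independence assumption gives log p(w|c) = sum of log p(w_i|c) over the
# words present. Multiplying by vec2Classify zeroes out absent words (and, for
# bag-of-words vectors, weights repeated words by their counts), so p1 and p0 are
# unnormalized log posteriors; the shared denominator p(w) cancels in the
# comparison and is never computed.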


def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNBO(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))


def textParse(bigString):
    # split on runs of non-word characters; the book's r'\W*' can match the empty
    # string, which makes re.split misbehave in Python 3 (single-character tokens
    # on 3.7+, a FutureWarning before that), so use r'\W+' instead
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
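
# A quick check of the tokenizer on an example string of my own:
#   textParse('Hello, World! ML in Action')  ->  ['hello', 'world', 'action']
# it splits on non-word characters, lowercases, and drops tokens of length <= 2
# such as 'ML' and 'in'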


def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        if i == 1: print(wordList)  # peek at the first parsed spam message
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        # some of the ham files contain bytes that a default open().read() cannot
        # decode, so use a forgiving encoding
        wordList = textParse(open('email/ham/%d.txt' % i, encoding='gb18030', errors='ignore').read())
        # wordList = textParse(open('email/ham/%d.txt' % i).read().decode('utf-8'))  # fails: Python 3 str has no decode()
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    # print('#'*100)
    # print(vocabList)
    trainingSet = list(range(50))  # Python 3.x: range returns a range object, not a list, so wrap it for the del below
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))  # uniform(x, y) returns a float in [x, y)
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNBO(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
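
# The 10-message test set is drawn at random, so a single run's error rate is
# noisy. A steadier estimate (my own suggestion, not in the book) would be to
# have spamTest return the error rate instead of only printing it, then average
# over repeated runs, e.g. mean([spamTest() for _ in range(10)]).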

# Return the 30 most frequent words in fullText as (word, count) pairs
def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(),key = lambda x:x[1],reverse=True)
    return sortedFreq[:30]
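
# e.g. with toy inputs: calcMostFreq(['a', 'b'], ['a', 'a', 'b'])  ->  [('a', 2), ('b', 1)]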

# The same pipeline applied to two RSS feeds (class 1 = feed1, class 0 = feed0);
# the 30 most frequent words are removed first, since such high-frequency words
# carry little information about which feed a post came from
def localWords(feed1, feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNBO(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:',float(errorCount)/len(testSet))
    return vocabList,p0V,p1V

# Show the most characteristic words of each feed (log probability above -6.0)
def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda x: x[1], reverse=True)
    print(sortedSF)
    print("SF**" * 10)
    for i in range(min(10, len(sortedSF))):
        print(sortedSF[i][0], end=" ")
    sortedNY = sorted(topNY, key=lambda x: x[1], reverse=True)
    print(sortedNY)
    print('NY**' * 10)
    for i in range(min(10, len(sortedNY))):
        print(sortedNY[i][0], end='\t')




if __name__ == '__main__':

    # listOPosts, listClasses = loadDataSet()
    # myVocabList = createVocabList(listOPosts)
    # print(listOPosts)
    # print(myVocabList)
    # print(setOfWords2Vec(myVocabList, listOPosts[0]))
    # # [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    # print(setOfWords2Vec(myVocabList, listOPosts[3]))
    # trainMat = []
    # for postinDoc in listOPosts:
    #     trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    #
    # p0V, p1V, pAb = trainNBO(trainMat, listClasses)
    # print(p0V)
    # print(p1V)
    # print(pAb)
    # testingNB()
    # emailText = open('email/ham/6.txt').read()
    # print(emailText)
    # regEx = re.compile('\\W*')
    # listOfTokens = regEx.split(emailText)
    # print(listOfTokens)
    # spamTest()
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')  # parse an RSS feed into a dict-like object
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    # print(ny)
    # print(len(ny['entries']))#25
    # vocabList, pSF, pNY = localWords(ny, sf)
    getTopWords(ny,sf)


