朴素贝叶斯原理与应用(三)

使用朴素贝叶斯分类器从个人广告中获取区域倾向

1.1 获取高频词
def caculateMostFreq(vocalList, fullText, topN=30):
    """Return the most frequent vocabulary tokens found in ``fullText``.

    :param vocalList: iterable of candidate tokens (the vocabulary).
    :param fullText: object whose ``count(token)`` method reports how many
        times a token occurs, e.g. a flat list of words.
    :param topN: maximum number of entries to return (defaults to 30, the
        value that used to be hard-coded).
    :return: list of ``(token, count)`` tuples sorted by count, highest
        first; tokens with equal counts keep their ``vocalList`` order.
    """
    import operator

    # One occurrence count per distinct vocabulary token.
    mostFreq = {token: fullText.count(token) for token in vocalList}
    # sorted() is stable, so ties stay in vocabulary order.
    sortedMostFreq = sorted(mostFreq.items(), key=operator.itemgetter(1), reverse=True)
    return sortedMostFreq[:topN]
1.2 下载安装feedparser
pip install feedparser
1.3 访问RSS,获取数据源
def localWords(feed1,feed0):
    """Train and evaluate a naive Bayes classifier on two parsed RSS feeds.

    Entries of ``feed1`` are labelled class 1 and entries of ``feed0`` are
    labelled class 0. The most frequent words (as reported by
    ``caculateMostFreq``) are removed from the vocabulary, a random hold-out
    set is classified, and the error rate is printed.

    :param feed1: parsed feed; reads ``feed1['entries'][i]['summary']``.
    :param feed0: parsed feed; reads ``feed0['entries'][i]['summary']``.
    :return: ``(vocalList, p0V, p1V)`` - the pruned vocabulary and the first
        two values returned by ``trainNB0`` (presumably the per-word
        log-probability vectors for class 0 and class 1 - confirm against
        ``trainNB0``).
    """
    docList = []; classList = []; fullText = []
    # Use the same number of entries from each feed so classes are balanced.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # feed1 -> class 1, so that p1V describes feed1.
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        classList.append(1)
        # extend (not append): fullText must be a flat word list, otherwise
        # fullText.count(word) is always 0 in caculateMostFreq.
        fullText.extend(wordList)
        # feed0 -> class 0, so that p0V describes feed0.
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        classList.append(0)
        fullText.extend(wordList)
    vocalList = createVocabList(docList)
    # Drop the most frequent words from the vocabulary.
    top30Words = caculateMostFreq(vocalList,fullText)
    for word, _count in top30Words:
        if word in vocalList:
            vocalList.remove(word)
    # A real list is required so hold-out indices can actually be removed.
    trainningSet = list(range(2*minLen)); testSet = []
    # Hold out 20 documents as before, but never more than half the corpus,
    # so small feeds still leave documents to train on.
    testSize = min(20, len(trainningSet) // 2)
    for _ in range(testSize):
        # randrange never returns the upper bound, unlike int(uniform(0, n)).
        randIndex = random.randrange(len(trainningSet))
        testSet.append(trainningSet[randIndex])
        del trainningSet[randIndex]
    trainMat = []; trainClass = []
    for docIndex in trainningSet:
        trainMat.append(bagOfWords2VecMN(vocalList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClass))
    errorCount = 0
    # Classify the held-out documents and count the mistakes.
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocalList,docList[docIndex])
        if classifyNB(np.array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    # Guard against an empty hold-out set (fewer than two documents).
    errorRate = float(errorCount/len(testSet)) if testSet else 0.0
    print('the error rate is ', errorRate)
    return vocalList, p0V, p1V
1.4 分析数据:显示地域相关的用词
def getTopWord(ny,sf):
    """Print the vocabulary words whose scores exceed -6.0 for each region.

    Calls ``localWords(ny, sf)`` and prints two lists, each sorted by score
    in descending order: first the words taken from ``p0V`` under the SF
    banner, then the words taken from ``p1V`` under the NY banner.

    NOTE(review): whether ``p0V``/``p1V`` really correspond to SF/NY depends
    on which class label ``localWords`` assigns to each feed - verify.

    :param ny: parsed feed, forwarded as the first argument of localWords.
    :param sf: parsed feed, forwarded as the second argument of localWords.
    :return: None; results are written to stdout.
    """
    import operator
    byScore = operator.itemgetter(1)
    vocalList, p0V, p1V = localWords(ny,sf)
    sfWords = []
    nyWords = []
    for idx, sfScore in enumerate(p0V):
        # Keep only words scoring above the -6.0 cut-off.
        if sfScore > -6.0:
            sfWords.append((vocalList[idx], sfScore))
        nyScore = p1V[idx]
        if nyScore > -6.0:
            nyWords.append((vocalList[idx], nyScore))
    sfWords.sort(key=byScore, reverse=True)
    print('*SF*SF*SF*SF*S*FS*FS*SF')
    for word, _score in sfWords:
        print(word)
    nyWords.sort(key=byScore, reverse=True)
    print('NY*NY*NY*NY')
    for word, _score in nyWords:
        print(word)
# Run the analysis. ny and sf are the parsed RSS feeds and must already be
# defined (via feedparser.parse) when this line executes.
getTopWord(ny,sf)

其中,ny 和 sf 是用 feedparser 解析得到的 RSS 数据源,需要在调用 getTopWord 之前先定义:

# Parse the two Craigslist RSS feeds that are passed to getTopWord
# (requires `import feedparser`).
# NOTE(review): the New York URL queries the 'res' category while the SF Bay
# URL queries 'apa' - confirm that the category mismatch is intentional.
ny = feedparser.parse('https://newyork.craigslist.org/search/res?format=rss')
sf = feedparser.parse('https://sfbay.craigslist.org/search/apa?format=rss')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值