#coding=utf-8
"""Show region-specific vocabulary with a naive Bayes classifier.

Two RSS feeds are downloaded, a classifier is trained on the 'summary'
text of their entries, and the words with the highest per-class scores
are printed for each feed.
"""
import operator
from collections import Counter

import feedparser
import bayes
import bayes_email
from numpy import *  # NOTE: supplies numpy's `random` module used in localwords()


# Compute the most frequent words (thirty by default).
def calMostFreq(vocabList, fullText, topN=30):
    """Return the `topN` most frequent vocabulary words.

    Args:
        vocabList: iterable of candidate words.
        fullText: list of every token seen (with repetitions).
        topN: how many (word, count) pairs to return; defaults to 30.

    Returns:
        A list of (word, count) tuples sorted by descending count.
    """
    # Count every token once instead of rescanning fullText per word.
    counts = Counter(fullText)
    freqDict = {token: counts[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(),
                        key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]


def localwords(feed1, feed0, testSize=20):
    """Train and evaluate a naive Bayes classifier on two parsed feeds.

    Entries of `feed1` are labelled class 1 and entries of `feed0` class 0.
    Only the first min(len(entries)) entries of each feed are used so both
    classes contribute the same number of documents. The most frequent
    words are dropped from the vocabulary, `testSize` documents are held
    out at random, and the error rate on them is printed.

    Args:
        feed1: parsed feed; must support feed1['entries'][i]['summary'].
        feed0: parsed feed with the same layout.
        testSize: number of hold-out documents; defaults to 20. It is
            capped so that at least one training document remains.

    Returns:
        (vocabList, p0V, p1V): the pruned vocabulary and the two
        per-class vectors returned by bayes.trainNB0.

    Raises:
        ValueError: if either feed has no entries.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if minLen == 0:
        # Previously this surfaced as an obscure IndexError while sampling.
        raise ValueError('both feeds must contain at least one entry')

    for i in range(minLen):
        # Interleave documents: feed1 entry (class 1), then feed0 entry (class 0).
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = bayes_email.textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)
    # Very frequent words are removed from the vocabulary before training.
    for word, _count in calMostFreq(vocabList, fullText):
        if word in vocabList:
            vocabList.remove(word)

    # list(...) so that `del` works on Python 3 as well.
    trainingSet = list(range(2 * minLen))
    # Never hold out every document: keep at least one for training.
    numTest = max(0, min(testSize, len(trainingSet) - 1))
    testSet = []
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = bayes.trainNB0(trainMat, trainClasses)

    errorCount = 0
    errorData = []
    for docIndex in testSet:
        # Use the same bag-of-words representation as in training; the
        # set-of-words vector used before did not match the trained model.
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            errorData.extend(docList[docIndex])

    errorRate = float(errorCount) / len(testSet) if testSet else 0.0
    # Single-argument print() behaves identically on Python 2 and 3.
    print('the error rate is : %s' % (errorRate,))
    print('the error data is : %s' % (errorData,))
    return vocabList, p0V, p1V


def getTopWords(ny, sf, threshold=-6.0):
    """Print the highest-scoring words for each of the two feeds.

    `ny` is passed to localwords() as class 1 and `sf` as class 0, so
    p1V scores belong to NY and p0V scores to SF.

    Args:
        ny: parsed feed treated as class 1.
        sf: parsed feed treated as class 0.
        threshold: only words whose score exceeds this value are listed;
            defaults to -6.0 (scores are presumably log-probabilities
            from bayes.trainNB0 -- confirm against that module).
    """
    vocabList, p0V, p1V = localwords(ny, sf)
    topNY = []
    topSF = []
    for word, scoreSF, scoreNY in zip(vocabList, p0V, p1V):
        if scoreSF > threshold:
            topSF.append((word, scoreSF))
        if scoreNY > threshold:
            topNY.append((word, scoreNY))

    sortedSF = sorted(topSF, key=operator.itemgetter(1), reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])

    sortedNY = sorted(topNY, key=operator.itemgetter(1), reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])


# Script driver: fetch both feeds and print their characteristic words.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
getTopWords(ny, sf)
朴素贝叶斯算法学习笔记(三)显示地域相关的用词
最新推荐文章于 2024-08-09 00:05:38 发布