使用朴素贝叶斯分类器从个人广告中获取区域倾向
1.1 获取高频词
def caculateMostFreq(vocalList, fullText, topN=30):
    """
    Return the most frequent vocabulary tokens together with their counts.

    :param vocalList: iterable of candidate tokens (the vocabulary).
    :param fullText: object whose ``count(token)`` method gives the number
        of occurrences of a token, e.g. a flat list of every word seen.
    :param topN: maximum number of ``(token, count)`` pairs to return.
        Defaults to 30, the cutoff that used to be hard-coded.
    :return: list of ``(token, count)`` tuples ordered by count, highest
        first. Tokens with equal counts keep the order in which they first
        appear in ``vocalList`` (the sort is stable).
    """
    # One count per distinct token; a repeated token simply overwrites itself.
    tokenCounts = {token: fullText.count(token) for token in vocalList}
    ranked = sorted(tokenCounts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:topN]
1.2 下载安装feedparser
pip install feedparser
1.3 访问RSS,获取数据源
def localWords(feed1,feed0):
    """
    Train a naive Bayes classifier on two parsed RSS feeds and report its
    error rate on a random hold-out set.

    The summary of every entry is tokenised with ``textParse``. Entries of
    ``feed1`` are labelled class 1 and entries of ``feed0`` class 0. The
    most frequent words (as returned by ``caculateMostFreq``) are removed
    from the vocabulary before training.

    :param feed1: parsed feed; ``feed1['entries'][i]['summary']`` supplies
        the class-1 documents.
    :param feed0: parsed feed; ``feed0['entries'][i]['summary']`` supplies
        the class-0 documents.
    :return: tuple ``(vocalList, p0V, p1V)`` - the pruned vocabulary and the
        first two values returned by ``trainNB0`` (presumably the per-word
        vectors for class 0 and class 1 - confirm against ``trainNB0``).
    """
    docList = []; classList = []; fullText = []
    # Use the same number of entries from each feed so the classes are balanced.
    minLen = min(len(feed0['entries']), len(feed1['entries']))
    for i in range(minLen):
        # The label must match the feed name: feed1 -> 1, feed0 -> 0.
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        classList.append(1)
        # extend (not append): fullText has to be a flat word list so that
        # counting a word in it yields its real frequency.
        fullText.extend(wordList)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        classList.append(0)
        fullText.extend(wordList)
    vocalList = createVocabList(docList)
    # Drop the most frequent words from the vocabulary.
    top30Words = caculateMostFreq(vocalList,fullText)
    for w in top30Words:
        if w[0] in vocalList:
            vocalList.remove(w[0])
    # A real list is required so that selected test documents can be removed.
    trainningSet = list(range(2*minLen)); testSet = []
    # Hold out up to 20 documents, but always leave at least one for training.
    numTest = min(20, max(len(trainningSet) - 1, 0))
    for _ in range(numTest):
        # Clamp in case uniform() returns the upper end-point.
        randIndex = min(int(random.uniform(0,len(trainningSet))),
                        len(trainningSet) - 1)
        testSet.append(trainningSet[randIndex])
        # Remove from the training set itself so no test document is
        # trained on or drawn twice.
        del trainningSet[randIndex]
    trainMat = []; trainClass = []
    for docIndex in trainningSet:
        trainMat.append(bagOfWords2VecMN(vocalList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClass))
    errorCount = 0
    # Classify the held-out documents and count the mistakes.
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocalList,docList[docIndex])
        if classifyNB(np.array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    # Avoid dividing by zero when there were too few documents to hold any out.
    errorRate = float(errorCount) / len(testSet) if testSet else 0.0
    print('the error rate is ', errorRate)
    return vocalList, p0V, p1V
1.4 分析数据:显示地域相关的用词
def getTopWord(ny,sf):
    """
    Print, for each of the two feeds, the vocabulary words whose score
    exceeds -6.0, highest score first.

    ``localWords(ny, sf)`` is called to obtain the vocabulary and the two
    score vectors. Words taken from ``p0V`` are printed under the SF banner
    and words taken from ``p1V`` under the NY banner. The scores are
    presumably per-word log-probabilities - confirm against ``localWords``.

    :param ny: parsed feed passed to ``localWords`` as its first argument.
    :param sf: parsed feed passed to ``localWords`` as its second argument.
    :return: None; the words are written to standard output.
    """
    import operator
    byScore = operator.itemgetter(1)
    vocalList, p0V, p1V = localWords(ny,sf)
    sfPairs = []
    nyPairs = []
    for idx, sfScore in enumerate(p0V):
        if sfScore > -6.0:
            sfPairs.append((vocalList[idx], sfScore))
        nyScore = p1V[idx]
        if nyScore > -6.0:
            nyPairs.append((vocalList[idx], nyScore))
    rankedSF = sorted(sfPairs, key=byScore, reverse=True)
    print('*SF*SF*SF*SF*S*FS*FS*SF')
    for word, _ in rankedSF:
        print(word)
    rankedNY = sorted(nyPairs, key=byScore, reverse=True)
    print('NY*NY*NY*NY')
    for word, _ in rankedNY:
        print(word)
# Run the analysis on the two parsed feeds; `ny` and `sf` must already be
# bound to feedparser results when this call executes.
getTopWord(ny,sf)
其中，ny 和 sf 是用 feedparser 解析得到的两个 RSS 数据源：
# Download and parse the two Craigslist RSS feeds that are passed to getTopWord.
# NOTE(review): the New York URL queries the "res" section while the SF Bay
# URL queries "apa" - confirm that comparing two different sections is intended.
ny = feedparser.parse('https://newyork.craigslist.org/search/res?format=rss')
sf = feedparser.parse('https://sfbay.craigslist.org/search/apa?format=rss')