python 热词图_python抓取百度热词

本文介绍了如何使用 Python 从百度热词排行榜获取数据，并通过解析 HTML 去除标签、提取关键词。抓取过程使用多线程并发处理以提高效率，最后将获取的热词写入文件，为绘制热词图做准备。
摘要由CSDN通过智能技术生成

#coding:utf-8

import urllib,urllib2

import re

import threading

from urllib import quote

import HTMLParser

# Work queue of hot words: doGetWords adds the words it scrapes here and
# getWords pops them off (putting a word back when its lookup fails).
wordList = set()

# Result set: getWords adds the related keywords it finds (or the original
# word when none are found) and dumps this set to "words.inc".
wordsList = set()

def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    The fragment is trimmed of surrounding whitespace and fed through
    ``HTMLParser``; every piece of character data the parser reports is
    collected and concatenated in document order.

    :param html: HTML fragment as a string.
    :returns: the concatenated text nodes, with tags dropped.
    """
    collected = []
    parser = HTMLParser.HTMLParser()
    # Route each text node straight into the list; tag callbacks keep the
    # parser's default do-nothing implementations, so markup is discarded.
    parser.handle_data = collected.append
    # A bare strip() already removes newlines, so a single call suffices.
    parser.feed(html.strip())
    # close() flushes any text the parser is still buffering.
    parser.close()
    return "".join(collected)

def getBaiduTopWords():
    """Fetch the Baidu board index and start the first-stage worker threads.

    Downloads ``http://top.baidu.com/boards``, extracts the numeric board id
    from every ``./buzz?b=<id>`` link, and starts ten threads that all run
    ``doGetWords`` over the same shared set of ids.

    :returns: None. The scraped words end up in the module-level sets.
    """
    url = "http://top.baidu.com/boards"
    webcontent = urllib.urlopen(url).read()

    # Raw string so the regex escapes (\. \? \d) reach the re module untouched.
    idSet = set(re.findall(r'href="\./buzz\?b=(\d+)?"', webcontent))
    # The id group is optional in the pattern, so a link with a bare "b="
    # produces an empty string; that is not a board id and must not be fetched.
    idSet.discard("")

    for i in range(10):
        print("Thread %s Start..." % (i + 1))
        t = threading.Thread(target=doGetWords, args=(idSet,))
        t.start()
        # join(1) waits at most one second, so this only staggers the thread
        # starts; it does not wait for the worker to finish.
        t.join(1)

def doGetWords(idSet):
    """Worker: drain *idSet*, scraping each board's words into ``wordList``.

    Ids are popped one at a time; the matching ``buzz`` page is downloaded
    and the regex matches are added to the module-level ``wordList``. An id
    whose download or parsing raises is put back into *idSet* and retried.

    When the set is empty and ``threading.activeCount()`` is 2, this worker
    starts ten second-stage threads running ``getWords``.

    :param idSet: shared, mutable set of board id strings.
    :returns: None.
    """
    while len(idSet) > 0:
        print("idSet length  %s" % len(idSet))
        try:
            uid = idSet.pop()
        except KeyError:
            # Another worker emptied the set between the length check and
            # pop(); there is nothing to put back, so just stop.
            break
        print("id =  %s" % uid)

        try:
            url = 'http://top.baidu.com/buzz?b=' + uid
            webcontent = urllib.urlopen(url).read()
            # NOTE(review): this pattern looks truncated (the surrounding HTML
            # tags were probably lost when the article was published); as
            # written it only yields empty matches. Restore the real pattern.
            words = re.findall('(.*?)', webcontent)
            print("%s words found." % len(words))
            wordList.update(words)
        except Exception as e:
            print(e)
            # retry: put the id back so some worker picks it up again.
            # NOTE(review): there is no retry limit, so a permanently failing
            # id keeps the loop alive.
            idSet.add(uid)
            continue

    print("threading activeCount %s" % threading.activeCount())

    # Presumably "2" means the main thread plus this worker, i.e. this is the
    # last first-stage worker still alive - confirm; the check is not atomic.
    if threading.activeCount() == 2:
        print("%s words found in total." % len(wordList))
        print(wordList)
        print(threading.enumerate())

        for i in range(10):
            print("Thread %s Start..." % (i + 1))
            t = threading.Thread(target=getWords, args=())
            t.start()
            # Waits at most one second; only staggers the thread starts.
            t.join(1)

def getWords():
    """Worker: expand each word in ``wordList`` into related keywords.

    Every word popped from the module-level ``wordList`` is POSTed to
    ``http://tool.chinaz.com/baidu/words.aspx``. The regex matches in the
    response are stripped of markup with ``strip_tags`` and added to
    ``wordsList``; when nothing is matched the word itself is added instead.
    A word whose request fails is put back into ``wordList`` for a retry.

    Once ``wordList`` is empty, the current contents of ``wordsList`` are
    written to ``words.inc``, one word per line.

    :returns: None.
    """
    while len(wordList) > 0:
        try:
            keyword = wordList.pop()
        except KeyError:
            # Another worker took the last word after the length check.
            break

        try:
            # NOTE(review): assumes the scraped bytes are GBK-encoded - confirm
            # against the encoding of the Baidu page.
            data = "kw=" + quote(keyword.decode("gbk").encode("utf8"))
            print(data)

            req = urllib2.Request("http://tool.chinaz.com/baidu/words.aspx")
            req.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; InfoPath.3)")
            req.add_header("Referer", "http://tool.chinaz.com/baidu/words.aspx")
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
            response = opener.open(req, data)
            html = response.read()
        except UnicodeError as e:
            # An encoding failure is deterministic: re-queueing the word would
            # spin forever, so keep the word unexpanded and move on.
            print(e)
            wordsList.add(keyword)
            continue
        except Exception as e:
            # Anything else is treated as a transient request failure: retry.
            print(e)
            wordList.add(keyword)
            continue

        try:
            # NOTE(review): the original pattern was split across lines and
            # appears to have lost its HTML tags in publication; as written it
            # only yields empty matches. Restore the real pattern.
            kw = re.findall('\n(.*?)', html)
            kw = [strip_tags(fragment) for fragment in kw]
        except Exception as e:
            # A parse failure would repeat on retry; fall back to the word.
            print(e)
            kw = []

        print(len(kw))
        if len(kw) == 0:
            wordsList.add(keyword)
        else:
            wordsList.update(kw)

    print("getWords threading activeCount %s" % threading.activeCount())

    # Copy first: other workers may still be adding to the set while we write.
    snapshot = list(wordsList)
    with open("words.inc", "w") as f:
        # writelines() adds no separators, so terminate every word explicitly.
        f.writelines(word + "\n" for word in snapshot)

# Script entry point: starts the scrape as soon as this file is executed
# (there is no __main__ guard, so importing the file triggers it as well).
getBaiduTopWords()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值