#coding:utf-8
import threading
import re
import urllib
from collections import deque
def encode(key):
    """Re-encode a GBK-encoded byte string as UTF-8 and return it."""
    decoded = key.decode("gbk")
    return decoded.encode("utf-8")
def getHotWord():
    """Worker loop: download queued URLs and collect the titles they list.

    Repeatedly pops a URL from the shared ``QUEUE``, downloads the page,
    extracts the text of every ``<a class="list-title" ...>`` anchor and
    appends the titles, converted with ``encode``, to the shared
    ``WORD_LIST``.  Once the queue is empty the loop ends and everything in
    ``WORD_LIST`` is written, one entry per line, to ``KeyWords.inc``.

    A URL whose download or parsing raises is put back on the queue so it
    is retried later; nothing in this function bounds the number of retries.
    """
    while True:
        try:
            url = QUEUE.popleft()
        except IndexError as e:
            # The queue is drained: report it and stop this worker.
            print(e)
            break
        print(url)
        try:
            webcontent = urllib.urlopen(url).read()
            txts = re.findall('<a class="list-title".*?">(.*?)</a>', webcontent)
            WORD_LIST.extend(map(encode, txts))
        except Exception as e:
            print(e)
            # Re-queue the URL as a single item; extend() would push each
            # character of the string onto the queue separately.
            QUEUE.append(url)
    # The context manager closes the file even if the write fails.
    with open("KeyWords.inc", 'w') as f:
        print("--> %d" % len(WORD_LIST))
        f.write("\n".join(WORD_LIST))
# Number of worker threads to start.
THREAD_NUM = 5

# Pages to download; each worker pulls its next URL from QUEUE.
URL_LIST = [
    'http://top.baidu.com/buzz?b=26&c=1',
    'http://top.baidu.com/buzz?b=659&c=1',
    'http://top.baidu.com/buzz?b=338&c=1',
    'http://top.baidu.com/buzz?b=340&c=1',
    'http://top.baidu.com/buzz?b=339&c=1',
    'http://top.baidu.com/buzz?b=337&c=1',
    'http://top.baidu.com/buzz?b=437&c=1',
]

# Shared work queue of pending URLs and shared collection of extracted words.
QUEUE = deque(URL_LIST)
WORD_LIST = deque()

# NOTE(review): the original indentation was lost; start() and join(1) are
# assumed to belong to the loop body -- confirm against the original script.
for i in range(THREAD_NUM):
    print("Thread %s Start..." % (i + 1))
    t = threading.Thread(target=getHotWord)
    t.start()
    # Wait at most one second for this worker before launching the next.
    t.join(1)
# 多线程抓取热词 (multi-threaded hot-word scraping)
# 最新推荐文章于 2021-12-06 09:38:54 发布