Crawling Qiushibaike with Multiple Threads
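The program below uses a producer-consumer design: a pageQueue holds the page numbers to fetch, three crawl threads download each page and push the raw HTML onto a dataQueue, and three parse threads pull from dataQueue, extract each post with XPath, and append the result to duanzi.json. Two global flags, CRAWL_EXIT and PARSE_EXIT, tell the workers when their queue has been drained.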
import threading
import requests
from queue import Queue, Empty
from lxml import etree
import json
import time

# Global flags that tell the worker threads when to stop.
CRAWL_EXIT = False
PARSE_EXIT = False
# Crawl thread: takes page numbers from pageQueue, downloads each page,
# and puts the raw HTML onto dataQueue.
class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def run(self):
        print("Starting " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Take one page number, first in first out.
                # 1. If the queue is empty and block is True, get() does not return;
                #    it blocks until new data arrives.
                # 2. If the queue is empty and block is False, get() raises queue.Empty.
                page = self.pageQueue.get(False)
            except Empty:
                continue
            url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            # Fetch the page content.
            content = requests.get(url, headers=self.headers).text
            time.sleep(1)
            self.dataQueue.put(content)
        print("Exiting " + self.threadName)
# Parse thread: takes raw HTML from dataQueue and extracts the posts.
class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.filename = filename
        self.lock = lock

    def run(self):
        print("Starting " + self.threadName)
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
            except Empty:
                continue
            self.parse(html)
        print("Exiting " + self.threadName)
    def parse(self, html):
        html = etree.HTML(html)
        # Each post lives in a div whose id contains "qiushi_tag".
        node_list = html.xpath('//div[contains(@id, "qiushi_tag")]')
        for node in node_list:
            username = node.xpath('./div/a/@title')[0]
            image = node.xpath('.//div[@class="thumb"]//@src')
            content = node.xpath('.//div[@class="content"]/span')[0].text
            zan = node.xpath('.//i')[0].text       # up-vote count
            comments = node.xpath('.//i')[1].text  # comment count
            items = {
                "username": username,
                "image": image,
                "content": content,
                "zan": zan,
                "comments": comments
            }
            # The file handle is shared by all parse threads, so guard writes with the lock.
            with self.lock:
                self.filename.write(json.dumps(items, ensure_ascii=False) + "\n")
def main():
    # Queue of page numbers 1..20 to crawl.
    pageQueue = Queue(20)
    for i in range(1, 21):
        pageQueue.put(i)
    # Queue of downloaded pages; no size argument means unbounded.
    dataQueue = Queue()
    filename = open("duanzi.json", "a", encoding="utf-8")
    lock = threading.Lock()

    # Start three crawl threads.
    crawlList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3"]
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Start three parse threads.
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)

    # Wait until every page number has been taken, then tell the crawl threads to exit.
    while not pageQueue.empty():
        pass
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("pageQueue is empty")
    for thread in threadcrawl:
        thread.join()
    print("All crawl threads joined")

    # Wait until every downloaded page has been parsed, then tell the parse threads to exit.
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadparse:
        thread.join()
    print("All parse threads joined")

    with lock:
        filename.close()
    print("Done. Thanks for using!")

if __name__ == "__main__":
    main()
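For reference, here is a minimal, self-contained sketch of the shutdown pattern both worker classes rely on: each thread polls its queue with block=False, treats queue.Empty as "nothing to do yet", and exits once a global flag flips. All names here (WORK_EXIT, Worker, demo) are illustrative only and are not part of the crawler above.

import threading
import time
from queue import Queue, Empty

WORK_EXIT = False  # flipped by the main thread once the queue has been drained

class Worker(threading.Thread):
    def __init__(self, name, work_queue):
        super(Worker, self).__init__()
        self.name = name
        self.work_queue = work_queue

    def run(self):
        while not WORK_EXIT:
            try:
                item = self.work_queue.get(False)  # non-blocking; raises Empty
            except Empty:
                continue
            print(self.name + " handled item " + str(item))

def demo():
    global WORK_EXIT
    work_queue = Queue()
    for i in range(10):
        work_queue.put(i)
    workers = [Worker("worker-" + str(n), work_queue) for n in range(3)]
    for w in workers:
        w.start()
    while not work_queue.empty():  # busy-wait, mirroring main() above
        time.sleep(0.1)
    WORK_EXIT = True
    for w in workers:
        w.join()

if __name__ == "__main__":
    demo()

Note that work_queue.empty() only says the items have been taken off the queue, not that the workers have finished handling them; the join() calls are what guarantee all work is done before the program exits, which is why the crawler above joins both thread pools before closing the output file.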