# coding:utf-8
import time
import json
import threading
from queue import Queue, Empty

import requests
from lxml import etree
CRAWL_EXIT = False
PARSE_EXIT = False
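# Producer/consumer pipeline: crawler threads take page numbers from
# pageQueue and push fetched HTML into dataQueue; parser threads take HTML
# from dataQueue and append extracted titles to a shared JSON-lines file.
# CRAWL_EXIT and PARSE_EXIT signal each group of threads to stop.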
class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue    # page numbers still to fetch
        self.dataQueue = dataQueue    # raw HTML handed to the parsers
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    def run(self):
        print("Starting " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises Empty once all pages are taken
                page = self.pageQueue.get(False)
                url = "https://tieba.baidu.com/f?kw=%E5%A6%B9%E5%AD%90&ie=utf-8&pn=" + str(page * 50)
                content = requests.get(url, headers=self.headers).text
                time.sleep(1)
                self.dataQueue.put(content)
            except (Empty, requests.RequestException):
                # Empty: queue drained; RequestException: skip a failed fetch
                pass
        print("Exiting " + self.threadName)
class ThreadParse(threading.Thread):
def __init__(self, threadName, dataQueue, filename, lock):
super(ThreadParse, self).__init__()
self.threadName = threadName
self.dataQueue = dataQueue
self.fileName = filename
self.lock = lock
    def run(self):
        print("Starting " + self.threadName)
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except Empty:
                pass
        print("Exiting " + self.threadName)
    def parse(self, html):
        # Extract every thread title from the forum list page
        html = etree.HTML(html)
        nodeList = html.xpath('//*[@id="thread_list"]//li/div/div[2]/div[1]/div[1]/a')
        for title in nodeList:
            items = {
                "title": title.text
            }
            # The output file is shared by all parser threads; serialize writes
            with self.lock:
                self.fileName.write(json.dumps(items, ensure_ascii=False) + "\n")
def main():
    # Page numbers 1-10; the pn query parameter is the thread offset (page * 50)
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)
    # Raw HTML handed from crawler threads to parser threads
    dataQueue = Queue()
    filename = open("duanzi.json", "a", encoding="utf-8")
    lock = threading.Lock()
    # Names of the three crawler threads
    crawlList = ["Crawler-1", "Crawler-2", "Crawler-3"]
    # List holding the three crawler threads
threadcrawl = []
for threadName in crawlList:
thread = ThreadCrawl(threadName, pageQueue, dataQueue)
thread.start()
threadcrawl.append(thread)
    # Names of the three parser threads
    parseList = ["Parser-1", "Parser-2", "Parser-3"]
    # List holding the three parser threads
threadparse = []
for threadName in parseList:
thread = ThreadParse(threadName, dataQueue, filename, lock)
thread.start()
threadparse.append(thread)
    # Busy-wait until every page number has been taken off pageQueue
    while not pageQueue.empty():
        pass
    # pageQueue is empty, so tell the crawler threads to leave their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("pageQueue is empty")
    for thread in threadcrawl:
        thread.join()
    print("All crawler threads joined")
    # Likewise, wait until the parser threads have drained dataQueue
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadparse:
        thread.join()
    print("All parser threads joined")
    with lock:
        # Close the output file
        filename.close()
    print("Done. Thanks for using!")
if __name__ == "__main__":
main()
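When run, the script appends one JSON object per extracted thread title to duanzi.json, one object per line, in the form {"title": "..."}.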