# -*- coding:utf-8 -*-
"""Crawl jobbole.com blog listing pages and record every image URL found.

Download threads take page numbers from a queue, fetch the listing page and
hand the raw HTML to ``parse_content_queue``.  Parse threads take HTML from
that queue and append every ``<img src>`` value, together with progress
messages, to ``result.txt``.
"""
import codecs
import sys
import threading
from time import sleep  # noqa: F401  (kept: imported by the original file)

try:
    from Queue import Empty, Queue  # Python 2
except ImportError:
    from queue import Empty, Queue  # Python 3

import requests
from lxml import etree

if sys.version_info[0] == 2:
    # Legacy behaviour kept for Python 2 only: lets byte strings supplied by
    # callers be implicitly decoded as UTF-8 when mixed with unicode text.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf-8")

# Number of listing pages to fetch (pages 1..PAGE_COUNT).
PAGE_COUNT = 30
# No trailing space: the page number is appended directly to this prefix.
PAGE_URL_PREFIX = u"http://blog.jobbole.com/all-posts/page/"
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0",
}
# Upper bound for a single HTTP request so one stalled server cannot hang a worker.
REQUEST_TIMEOUT_SECONDS = 10
# How often an idle parser wakes up to re-check ``parse_switch``.
QUEUE_POLL_SECONDS = 0.5

file_handle = codecs.open("result.txt", "w", encoding="utf-8")
# Serialises writes to ``file_handle``, which is shared by every worker thread.
_file_lock = threading.Lock()

# Raw page bodies waiting to be parsed (filled by DownloadThread).
parse_content_queue = Queue()
# Parsers keep running while this is true; the driver clears it to stop them.
parse_switch = True


def _log(*lines):
    """Append ``lines`` to the result file as one uninterrupted group.

    All lines are written while holding the file lock, so output produced by
    a single call is never interleaved with output from another thread.
    """
    with _file_lock:
        for line in lines:
            file_handle.write(line + u"\n")


class DownloadThread(threading.Thread):
    """Worker that downloads listing pages until its page queue is drained."""

    def __init__(self, thread_name, queue):
        """Store the display name and the queue of page numbers to fetch."""
        super(DownloadThread, self).__init__()
        self.thread_name = thread_name
        self.queue = queue

    def run(self):
        """Fetch pages until the queue is empty, queueing each body for parsing."""
        while True:
            # get_nowait() instead of empty()+get(): with several workers the
            # queue can be drained between the check and a blocking get(),
            # which would leave this thread waiting forever.
            try:
                page = self.queue.get_nowait()
            except Empty:
                _log(self.thread_name + u"已经下班")
                break

            _log(self.thread_name + u"准备下载" + str(page))
            url = PAGE_URL_PREFIX + str(page)
            try:
                response = requests.get(
                    url=url,
                    headers=REQUEST_HEADERS,
                    timeout=REQUEST_TIMEOUT_SECONDS,
                )
            except requests.RequestException:
                # Network failure: report it like any other failed download
                # instead of letting the exception kill the worker.
                response = None

            if response is not None and response.status_code == 200:
                parse_content_queue.put(response.content)
            else:
                _log(self.thread_name + u"网页下载失败" + url)
            _log(self.thread_name + u"下载完成" + str(page))


class ParseThread(threading.Thread):
    """Worker that extracts image URLs from downloaded pages."""

    def __init__(self, thread_name):
        """Store the display name used in progress messages."""
        super(ParseThread, self).__init__()
        self.thread_name = thread_name

    def run(self):
        """Parse queued pages until ``parse_switch`` is cleared."""
        while parse_switch:
            # A timed get() lets the loop notice parse_switch being cleared;
            # a plain blocking get() would never return on an empty queue.
            try:
                content = parse_content_queue.get(timeout=QUEUE_POLL_SECONDS)
            except Empty:
                continue
            try:
                self._parse(content)
            finally:
                # Always acknowledge the item so parse_content_queue.join()
                # in the driver cannot wait forever.
                parse_content_queue.task_done()

    def _parse(self, content):
        """Write the ``src`` of every ``<img>`` in ``content`` to the result file."""
        lines = [
            u"--------------------------",
            self.thread_name + u"开始解析",
        ]
        try:
            doc = etree.HTML(content)
        except etree.LxmlError:
            doc = None
        # A missing tree (empty or unparsable body) simply contributes no URLs.
        if doc is not None:
            lines.extend(doc.xpath("//img/@src"))
        lines.append(self.thread_name + u"结束解析")
        _log(*lines)


def main():
    """Download all listing pages, then parse them and close the result file."""
    global parse_switch

    try:
        download_queue = Queue(maxsize=PAGE_COUNT)
        for page in range(1, PAGE_COUNT + 1):
            download_queue.put(page)

        download_thread_names = [
            u"下载器1",
            u"下载器2",
            u"下载器3",
            u"下载器4",
        ]
        download_thread_list = [
            DownloadThread(thread_name, download_queue)
            for thread_name in download_thread_names
        ]
        for thread in download_thread_list:
            thread.start()
        # join() already blocks until every downloader has finished, so no
        # busy-wait on the queue is needed.
        for thread in download_thread_list:
            thread.join()

        parse_switch = True
        parse_thread_names = [
            u"解析器1",
            u"解析器2",
            u"解析器3",
        ]
        parse_thread_list = [
            ParseThread(thread_name) for thread_name in parse_thread_names
        ]
        for thread in parse_thread_list:
            thread.start()
        try:
            # Blocks until task_done() has been called for every queued page,
            # i.e. until parsing has finished, not merely until the queue is empty.
            parse_content_queue.join()
        finally:
            parse_switch = False
        for thread in parse_thread_list:
            thread.join()
    finally:
        # Flush and release the result file even if the run is interrupted.
        file_handle.close()


if __name__ == '__main__':
    main()
# 多线程实际应用
# 最新推荐文章于 2024-04-05 17:30:19 发布