多线程实际应用

最新推荐文章于 2024-04-05 17:30:19 发布

han_yanlong

最新推荐文章于 2024-04-05 17:30:19 发布

阅读量338

点赞数

文章标签： python 爬虫 xpath 多线程

本文链接：https://blog.csdn.net/han_yanlong/article/details/76998653

版权

# -*- coding:utf-8 -*-
import sys
# Python 2-only idiom: the module is reloaded so that
# ``sys.setdefaultencoding`` can be called, making implicit str/unicode
# conversions (e.g. byte strings passed to the UTF-8 writer below) use UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")
import requests
from Queue import Queue
from lxml import etree
import threading
from time import sleep
import codecs

# Shared output file used by every thread in this script for progress lines
# and extracted image URLs. Mode "w" truncates any existing result.txt.
file_handle = codecs.open("result.txt", "w", encoding="utf-8")


class DownloadThread(threading.Thread):
    """Worker thread that downloads listing pages and hands off their HTML.

    Page numbers are taken from ``queue``. The body of every page answered
    with HTTP 200 is put on the module-level ``parse_content_queue``;
    progress and failure lines are written to the module-level
    ``file_handle``.

    Args:
        thread_name: Label prefixed to every log line of this worker.
        queue: Queue of page numbers still to be downloaded.
    """

    # Listing URL prefix; the page number is appended directly (no space).
    BASE_URL = "http://blog.jobbole.com/all-posts/page/"
    # Seconds to wait on the server, so a stalled connection cannot hang
    # the worker forever.
    REQUEST_TIMEOUT = 10
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0",
    }

    def __init__(self, thread_name, queue):
        super(DownloadThread, self).__init__()
        self.thread_name = thread_name
        self.queue = queue

    @classmethod
    def _page_url(cls, page):
        """Return the listing URL for page number ``page``."""
        return cls.BASE_URL + str(page)

    def _next_page(self):
        """Return the next queued page number, or None when none is left.

        A single non-blocking ``get_nowait()`` replaces the separate
        ``empty()`` check followed by a blocking ``get()``: with several
        workers, another thread could take the last item between those two
        calls and leave this one blocked forever.
        """
        try:
            from Queue import Empty  # Python 2 module name
        except ImportError:
            from queue import Empty  # Python 3 module name

        try:
            return self.queue.get_nowait()
        except Empty:
            return None

    def run(self):
        """Download pages until the queue is drained, logging each step."""
        while True:
            page = self._next_page()
            if page is None:
                file_handle.write(self.thread_name + "已经下班" + "\n")
                break
            file_handle.write(self.thread_name + "准备下载" + str(page) + "\n")

            url = self._page_url(page)
            try:
                response = requests.get(
                    url=url,
                    headers=self.HEADERS,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException:
                # A network error is logged as a failed page below instead
                # of killing the worker with an unhandled exception.
                response = None

            if response is not None and response.status_code == 200:
                parse_content_queue.put(response.content)
            else:
                file_handle.write(self.thread_name + "网页下载失败" + url + "\n")

            file_handle.write(self.thread_name + "下载完成" + str(page) + "\n")


class ParseThread(threading.Thread):
    """Worker thread that extracts image URLs from downloaded pages.

    Page bodies are taken from a content queue, parsed with lxml, and every
    ``//img/@src`` value is written to the module-level ``file_handle``.
    The loop runs while the module-level ``parse_switch`` flag is true.

    Args:
        thread_name: Label prefixed to the log lines of this worker.
        content_queue: Optional queue of page bodies to parse. When omitted,
            the module-level ``parse_content_queue`` is used.
    """

    # Longest time (seconds) one wait on the queue may last before the stop
    # flag is checked again.
    POLL_INTERVAL = 1

    def __init__(self, thread_name, content_queue=None):
        super(ParseThread, self).__init__()
        self.thread_name = thread_name
        self.content_queue = content_queue

    def _next_content(self):
        """Return the next page body, or None if none arrived in time.

        The wait is bounded by ``POLL_INTERVAL``. An unbounded ``get()``
        would keep the thread blocked on an empty queue forever, so it could
        never notice the stop flag and ``join()`` would never return.
        """
        try:
            from Queue import Empty  # Python 2 module name
        except ImportError:
            from queue import Empty  # Python 3 module name

        source = self.content_queue
        if source is None:
            source = parse_content_queue
        try:
            return source.get(timeout=self.POLL_INTERVAL)
        except Empty:
            return None

    def run(self):
        """Parse queued pages until ``parse_switch`` is cleared."""
        # ``parse_switch`` is only read here, so no ``global`` declaration
        # is required.
        while parse_switch:
            content = self._next_content()
            if content is None:
                # Nothing to parse yet; loop back and re-check the flag.
                continue

            file_handle.write("--------------------------" + "\n")
            doc = etree.HTML(content)
            file_handle.write(self.thread_name + "开始解析" + "\n")
            # NOTE(review): etree.HTML may yield None for an empty document;
            # treat that as "no images" rather than crashing on .xpath.
            img_list = doc.xpath("//img/@src") if doc is not None else []
            for img in img_list:
                file_handle.write(img + "\n")
            file_handle.write(self.thread_name + "结束解析" + "\n")


if __name__ == '__main__':
    # Script entry point: download 30 listing pages with four workers, then
    # parse the downloaded bodies with three workers.

    # Page numbers waiting to be downloaded.
    download_queue = Queue(maxsize=30)
    # Downloaded page bodies waiting to be parsed (read by the threads as a
    # module-level global).
    parse_content_queue = Queue()
    # Stop flag polled by the parse threads; cleared once parsing is done.
    parse_switch = True

    for page in range(1, download_queue.maxsize + 1):
        download_queue.put(page)

    download_thread_names = [
        "下载器1",
        "下载器2",
        "下载器3",
        "下载器4",
    ]
    download_thread_list = []
    for thread_name in download_thread_names:
        d = DownloadThread(thread_name, download_queue)
        d.start()
        download_thread_list.append(d)

    # join() already waits until every downloader has finished, so no
    # CPU-burning spin on download_queue.empty() is needed beforehand.
    for thread in download_thread_list:
        thread.join()

    parse_thread_names = [
        "解析器1",
        "解析器2",
        "解析器3",
    ]
    parse_thread_list = []
    for thread_name in parse_thread_names:
        p = ParseThread(thread_name)
        p.start()
        parse_thread_list.append(p)

    # Poll with a short sleep instead of a tight ``pass`` loop so the main
    # thread does not occupy a core while the parsers work.
    while not parse_content_queue.empty():
        sleep(0.1)

    parse_switch = False

    for thread in parse_thread_list:
        thread.join()

    # Every worker has been joined, so nothing writes to the file any more;
    # close it to flush buffered output and release the handle.
    file_handle.close()