# coding=utf-8
"""Multithreaded spider for qiushibaike.com.

Pipeline of daemon threads connected by queues:
    url_queue -> parse_url (x3) -> html_queue -> get_content_list
              -> content_queue -> save_content_list -> ./qiushi.txt
"""
import requests
from retrying import retry
from lxml import etree
import json
from queue import Queue
import threading


class QiushiSpider(object):
    """Crawl the first 13 list pages and append extracted items to ./qiushi.txt."""

    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # parsed lxml trees (None on fetch failure)
        self.content_queue = Queue()  # lists of extracted item dicts

    def get_url_list(self):
        """Producer: enqueue the 13 list-page URLs."""
        for page in range(1, 14):
            self.url_queue.put(self.url_temp.format(page))

    @retry(stop_max_attempt_number=3)
    def _parse_url(self, url):
        """Fetch *url* and return its parsed lxml tree; retried up to 3 times."""
        response = requests.get(url, headers=self.headers, timeout=5)
        # Raise on non-2xx (which also triggers @retry) instead of the original
        # `assert`, which is silently stripped under `python -O`.
        response.raise_for_status()
        return etree.HTML(response.content)

    def parse_url(self):
        """Consumer of url_queue / producer of html_queue."""
        while True:
            url = self.url_queue.get()
            print(" now parsing", url)
            try:
                html = self._parse_url(url)
            except Exception as e:
                # Best effort: log the failure and push None so the pipeline
                # keeps moving; downstream skips None entries.
                print(e)
                html = None
            self.html_queue.put(html)
            self.url_queue.task_done()

    def get_content_list(self):
        """Consumer of html_queue / producer of content_queue: extract items."""
        while True:
            html = self.html_queue.get()
            # Bug fix: a failed fetch enqueues None; the original called
            # html.xpath(...) unconditionally, which killed this thread with
            # AttributeError and deadlocked html_queue.join() in run().
            if html is None:
                self.html_queue.task_done()
                continue
            content_list = []
            for div in html.xpath("//div[contains(@id,'qiushi_tag_')]"):
                item = {}
                src = div.xpath("./div[@class='author clearfix']/a[1]/img/@src")
                item["src"] = "https:" + src[0] if len(src) > 0 else None
                author_name = div.xpath("./div[@class='author clearfix']/a[1]/img/@alt")
                item["author_name"] = author_name[0] if len(author_name) > 0 else None
                # Class attribute looks like "articleGender womenIcon" -> "women".
                author_gender = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = (author_gender[0].split(" ")[-1].replace("Icon", "")
                                         if len(author_gender) > 0 else None)
                author_age = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["author_age"] = author_age[0] if len(author_age) > 0 else None
                duanzi_content = div.xpath(".//div[@class='content']/span/text()")
                item["duanzi_content"] = [i.replace("\n", "") for i in duanzi_content]
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        """Consumer of content_queue: append each item as pretty-printed JSON."""
        while True:
            content_list = self.content_queue.get()
            file_path = './qiushi.txt'
            with open(file_path, "a", encoding='utf-8') as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False, indent=4))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        """Start all pipeline threads and block until every queue is drained."""
        thread_list = []
        # 1. produce the URL list
        thread_list.append(threading.Thread(target=self.get_url_list))
        # 2. fetch pages with 3 worker threads
        for _ in range(3):
            thread_list.append(threading.Thread(target=self.parse_url))
        # 3. extract data
        thread_list.append(threading.Thread(target=self.get_content_list))
        # 4. save results
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # t.setDaemon(True) is deprecated since Python 3.10; workers loop
            # forever, so they must not keep the process alive.
            t.daemon = True
            t.start()
        # Join the queues (not the threads): returns once every put() item
        # has been matched by a task_done().
        for q in (self.url_queue, self.html_queue, self.content_queue):
            q.join()


if __name__ == '__main__':
    qiushi = QiushiSpider()
    qiushi.run()
多线程爬虫之糗事百科
最新推荐文章于 2022-03-31 16:09:33 发布