Qiushibaike (糗事百科) spider based on queues and multithreading

import threading
import time
from queue import Queue

import requests
from lxml import etree


class QiuBaiSpider(object):
    # 1. Target URL template, request headers, counters and the work queues
    def __init__(self):
        self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
        self.data = 0                  # number of items parsed
        self.url_queue = Queue()       # page URLs waiting to be fetched
        self.response_queue = Queue()  # responses waiting to be parsed
        self.data_queue = Queue()      # parsed items waiting to be saved
        self.count = 0                 # number of pages parsed

    # 2. Generate the page URLs to crawl
    def get_url_list(self):
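        # pages 1 through 12 of the /hot/ section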
        for i in range(1, 13):
            url = self.base_url.format(i)
            self.url_queue.put(url)

    # 3. Fetch each URL (request workers)
    def send_request(self):
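        # worker loop: block on url_queue.get(), fetch the page, and hand the
        # response to the parser via response_queue; task_done() tells
        # url_queue.join() that this URL has been fully handled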
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.headers)
            self.response_queue.put(response)
            self.url_queue.task_done()

    # 4. Parse the responses
    def analysis_data(self):
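        # take the raw HTML from the response queue, build an lxml tree, and
        # pull the first <h2> text out of every item div on the page
        # (the XPath targets the site's #content-left container)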
        while True:
            data = self.response_queue.get().content
            self.count += 1
            html_data = etree.HTML(data)
            div_list = html_data.xpath("""//*[@id="content-left"]/div""")
            for i in div_list:
                text = i.xpath('.//h2/text()')[0]
                self.data += 1
                self.data_queue.put(text)
            self.response_queue.task_done()

    # 5. Save the parsed data
    def write_file(self):
        while True:
            data = self.data_queue.get()
            # append each item to a local file (the filename here is illustrative)
            with open('qiushibaike.txt', 'a', encoding='utf-8') as f:
                f.write(data.strip() + '\n')
            self.data_queue.task_done()

    def _start(self):
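        # one URL-producer thread, two fetcher threads, one parser thread and
        # one writer thread, all daemons, all coordinated through the queues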
        th_list = []
        # thread that builds the URL list
        th_url = threading.Thread(target=self.get_url_list)
        th_list.append(th_url)

        # two fetcher threads that send the requests
        for i in range(2):
            th_send = threading.Thread(target=self.send_request)
            th_list.append(th_send)

        # thread that parses the responses
        th_analysis = threading.Thread(target=self.analysis_data)
        th_list.append(th_analysis)

        # thread that writes out the parsed data
        th_save = threading.Thread(target=self.write_file)
        th_list.append(th_save)
        print(th_list)
        # mark every thread as a daemon and start it
        for th in th_list:
            th.daemon = True  # daemon threads are killed when the main thread exits
            th.start()
        # block until every queue has been fully processed (every put() matched by a task_done())
        for q in [self.url_queue, self.response_queue, self.data_queue]:
            q.join()

    def run(self):
        start = time.time()
        self._start()
        end = time.time()
        print(end - start, "seconds elapsed")
        print(self.data)  # total number of items parsed


if __name__ == '__main__':
    qiu_bai = QiuBaiSpider()
    qiu_bai.run()
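
For reference, here is a minimal, self-contained sketch of the same queue + daemon-thread pattern with the site-specific parts stripped out. The URLs are placeholders and the network call is simulated with time.sleep(), so it illustrates only the coordination logic, not a drop-in replacement for the spider above.

import threading
import time
from queue import Queue

url_queue = Queue()
result_queue = Queue()

def produce_urls():
    # producer: enqueue the jobs to be processed (placeholder URLs)
    for i in range(1, 13):
        url_queue.put('https://example.com/page/{}/'.format(i))

def fetch_worker():
    # consumer of url_queue / producer of result_queue
    while True:
        url = url_queue.get()
        time.sleep(0.1)              # stands in for requests.get(url)
        result_queue.put('fetched ' + url)
        url_queue.task_done()        # lets url_queue.join() return once all URLs are done

def collector():
    # final consumer: here it just prints; the spider above writes to a file
    while True:
        print(result_queue.get())
        result_queue.task_done()

threads = [threading.Thread(target=produce_urls)]
threads += [threading.Thread(target=fetch_worker) for _ in range(2)]
threads.append(threading.Thread(target=collector))
for t in threads:
    t.daemon = True                  # daemons die as soon as the main thread exits
    t.start()
for q in (url_queue, result_queue):
    q.join()                         # blocks until every put() has a matching task_done()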

 

Reposted from: https://www.cnblogs.com/ls1997/p/11276819.html
