# 果壳问答(队列) — Guokr Q&A spider (threads + queue)

# @Time : 2020/5/7 18:19
# @Author : GKL
# FileName : spider.py
# Software : PyCharm


import time
import json
import requests
from threading import Thread
from queue import Queue
from lxml import etree


class Spider(Thread):
    """Worker thread that scrapes question-listing pages.

    Each worker pulls URLs from a shared queue until the queue is drained,
    extracts every question entry on the page, prints it, and appends it to
    ``result.json`` as one JSON object per line.
    """

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would hang the worker forever.
    timeout = 10

    def __init__(self, url_queue):
        """Store the shared URL queue and the HTTP headers used for requests.

        Args:
            url_queue: ``queue.Queue`` of page URLs shared by all workers.
        """
        Thread.__init__(self)
        self.url_queue = url_queue
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def run(self):
        """Fetch and process URLs until the shared queue is empty."""
        # Local import: the file header only imports ``Queue``.
        from queue import Empty

        while True:
            # ``empty()`` followed by a blocking ``get()`` is racy: another
            # worker may take the last URL in between, leaving this thread
            # blocked forever. A non-blocking get avoids that.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            self.get_data(url)

    @staticmethod
    def _first(values, default=''):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def get_data(self, url):
        """Download one listing page and persist every question found on it.

        Args:
            url: address of the listing page to scrape.

        Fields missing from an entry are stored as empty strings instead of
        aborting the whole page.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout).text
        page = etree.HTML(response)
        node_list = page.xpath('//ul[@class="ask-list-cp"]//li')

        lines = []
        for node in node_list:
            items = {
                # number of followers
                '关注数': self._first(node.xpath('.//p[@class="ask-focus-nums"]/span/text()')),
                # number of answers
                '回答数': self._first(node.xpath('.//p[@class="ask-answer-nums"]/span/text()')),
                # question title
                '问题': self._first(node.xpath('.//div[@class="ask-list-detials"]/h2/a/text()')),
                # link to the detail page
                '详情页': self._first(node.xpath('.//div[@class="ask-list-detials"]/h2/a/@href')),
                # tags, joined with '-'
                '标签': '-'.join(node.xpath('.//div[@class="ask-list-legend"]//a/text()')),
            }
            print(items)
            lines.append(json.dumps(items, ensure_ascii=False) + '\n')

        # Open the output file once per page (and only when there is data)
        # rather than once per item.
        if lines:
            with open('result.json', 'a', encoding='utf-8') as f:
                f.write(''.join(lines))


if __name__ == '__main__':
    # Script entry point: enqueue 100 listing pages, crawl them with five
    # worker threads, and report the elapsed wall-clock time.
    started_at = time.time()

    url_queue = Queue()
    for page_no in range(1, 101):
        url_queue.put('https://www.guokr.com/ask/highlight/?page={}'.format(page_no))

    # Create and start five workers that all share the same queue
    workers = []
    for _ in range(5):
        worker = Spider(url_queue)
        worker.start()
        workers.append(worker)

    # Block the main thread until every worker has finished
    for worker in workers:
        worker.join()

    print(time.time() - started_at)


在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值