# @Time : 2020/5/7 18:19
# @Author : GKL
# FileName : spider.py
# Software : PyCharm
import json
import time
from queue import Empty, Queue
from threading import Lock, Thread

import requests
from lxml import etree
class Spider(Thread):
    """Worker thread that drains a shared queue of Guokr Q&A listing-page URLs,
    scrapes each page and appends one JSON line per question to result.json."""

    # Shared across all workers: serializes appends to result.json so that
    # lines from different threads cannot interleave.
    _file_lock = Lock()

    def __init__(self, url_queue):
        """url_queue: Queue of listing-page URLs, pre-filled by the caller."""
        Thread.__init__(self)
        self.url_queue = url_queue
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def run(self):
        # get_nowait()/Empty avoids the empty()->get() race of the original:
        # two threads could both see one remaining URL and the loser would
        # block forever inside get().
        while True:
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            self.get_data(url)

    def get_data(self, url):
        """Fetch one listing page and persist every question entry found on it."""
        # Timeout so a stalled server cannot hang this worker indefinitely.
        response = requests.get(url, headers=self.headers, timeout=10).text
        page = etree.HTML(response)
        node_list = page.xpath('//ul[@class="ask-list-cp"]//li')
        for node in node_list:
            # Keep the raw xpath lists; a bare [0] on a malformed <li> would
            # raise IndexError and kill the whole thread.
            follow = node.xpath('.//p[@class="ask-focus-nums"]/span/text()')        # follower count (关注数)
            answer = node.xpath('.//p[@class="ask-answer-nums"]/span/text()')       # answer count (回答数)
            question = node.xpath('.//div[@class="ask-list-detials"]/h2/a/text()')  # question title (问题)
            href = node.xpath('.//div[@class="ask-list-detials"]/h2/a/@href')       # detail-page URL (详情页)
            tags = '-'.join(node.xpath('.//div[@class="ask-list-legend"]//a/text()'))  # tags (标签)
            if not (follow and answer and question and href):
                # Malformed entry: skip it instead of crashing the worker.
                continue
            items = {
                '关注数': follow[0],
                '回答数': answer[0],
                '问题': question[0],
                '详情页': href[0],
                '标签': tags
            }
            print(items)
            with self._file_lock:
                with open('result.json', 'a', encoding='utf-8') as f:
                    f.write(json.dumps(items, ensure_ascii=False) + '\n')
def main():
    """Entry point: enqueue 100 Guokr Q&A listing pages and scrape them with 5 worker threads."""
    start = time.time()
    url_queue = Queue()
    # Pages 1..100 of the highlight listing.
    for page in range(1, 101):
        url_queue.put('https://www.guokr.com/ask/highlight/?page={}'.format(page))
    # 5 worker threads share one queue.
    workers = [Spider(url_queue) for _ in range(5)]
    for worker in workers:
        worker.start()
    # Block the main thread until every worker has drained the queue.
    for worker in workers:
        worker.join()
    print(time.time() - start)


if __name__ == '__main__':
    main()
# 果壳问答(队列) — Guokr Q&A scraper (queue-based worker threads)
# (blog-scrape residue, kept for provenance: "最新推荐文章于 2024-06-09 18:26:29 发布"
#  = "latest recommended article published 2024-06-09 18:26:29")