# @Time : 2020/5/7 18:19
# @Author : GKL
# FileName : spider.py
# Software : PyCharm
import json
import time
from queue import Empty, Queue
from threading import Lock, Thread

import requests
from lxml import etree
class Spider(Thread):
    """Worker thread that drains a shared queue of Guokr Q&A listing-page URLs,
    scrapes each page and appends one JSON line per question to result.json."""

    # Shared across all workers: serializes appends to result.json so that
    # lines from different threads cannot interleave.
    _file_lock = Lock()

    def __init__(self, url_queue):
        """url_queue: Queue of listing-page URLs, pre-filled by the caller."""
        Thread.__init__(self)
        self.url_queue = url_queue
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def run(self):
        # get_nowait()/Empty avoids the empty()->get() race of the original:
        # two threads could both see one remaining URL and the loser would
        # block forever inside get().
        while True:
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            self.get_data(url)

    def get_data(self, url):
        """Fetch one listing page and persist every question entry found on it."""
        # Timeout so a stalled server cannot hang this worker indefinitely.
        response = requests.get(url, headers=self.headers, timeout=10).text
        page = etree.HTML(response)
        node_list = page.xpath('//ul[@class="ask-list-cp"]//li')
        for node in node_list:
            # Keep the raw xpath lists; a bare [0] on a malformed <li> would
            # raise IndexError and kill the whole thread.
            follow = node.xpath('.//p[@class="ask-focus-nums"]/span/text()')        # follower count (关注数)
            answer = node.xpath('.//p[@class="ask-answer-nums"]/span/text()')       # answer count (回答数)
            question = node.xpath('.//div[@class="ask-list-detials"]/h2/a/text()')  # question title (问题)
            href = node.xpath('.//div[@class="ask-list-detials"]/h2/a/@href')       # detail-page URL (详情页)
            tags = '-'.join(node.xpath('.//div[@class="ask-list-legend"]//a/text()'))  # tags (标签)
            if not (follow and answer and question and href):
                # Malformed entry: skip it instead of crashing the worker.
                continue
            items = {
                '关注数': follow[0],
                '回答数': answer[0],
                '问题': question[0],
                '详情页': href[0],
                '标签': tags
            }
            print(items)
            with self._file_lock:
                with open('result.json', 'a', encoding='utf-8') as f:
                    f.write(json.dumps(items, ensure_ascii=False) + '\n')
def main():
    """Entry point: enqueue 100 Guokr Q&A listing pages and scrape them with 5 worker threads."""
    start = time.time()
    url_queue = Queue()
    # Pages 1..100 of the highlight listing.
    for page in range(1, 101):
        url_queue.put('https://www.guokr.com/ask/highlight/?page={}'.format(page))
    # 5 worker threads share one queue.
    workers = [Spider(url_queue) for _ in range(5)]
    for worker in workers:
        worker.start()
    # Block the main thread until every worker has drained the queue.
    for worker in workers:
        worker.join()
    print(time.time() - start)


if __name__ == '__main__':
    main()
# 果壳问答(队列) — Guokr Q&A scraper (queue-based worker threads)
# (blog-scrape residue, kept for provenance: "最新推荐文章于 2024-06-09 18:26:29 发布"
#  = "latest recommended article published 2024-06-09 18:26:29")