1. 导入猴子补丁和协程池(注意:monkey.patch_all() 要在导入其他模块之前执行)
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
2. 创建协程池对象和各个队列对象(URL 队列、响应队列、数据队列)
# Excerpt of instance-attribute setup (meant to live inside a method such as
# __init__, where `self` is bound).
# Task pool created from gevent.pool.Pool with its default arguments.
self.pool = Pool()
# Three queues; judging by their names they carry URLs, fetched pages and
# extracted data respectively -- confirm against the code that uses them.
self.url_queue = Queue()
self.page_queue = Queue()
self.data_queue = Queue()
代码示例
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
import requests
from lxml import etree
import json
from queue import Queue
import time
# 死循环装饰器:让被装饰的函数在死循环中反复执行
def run_forever(func):
    """Decorate a one-argument callable so that it is invoked in an endless loop.

    The returned wrapper takes a single positional argument ``obj`` (the
    instance, when used on a method) and calls ``func(obj)`` repeatedly.
    It never returns normally; the loop ends only if ``func`` raises or the
    running task is killed from outside.

    Args:
        func: Callable accepting exactly one positional argument.

    Returns:
        The looping wrapper, carrying ``func``'s name and docstring.
    """
    # Local import keeps this block self-contained without touching the
    # file's import section.
    from functools import wraps

    @wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def forever(obj):
        while True:
            func(obj)

    return forever
class JiubaiSpider(object):
    """Skeleton of a queue-driven spider whose stages run in a gevent pool.

    Work is meant to flow through three queues: ``url_queue`` ->
    ``page_queue`` -> ``data_queue``.  The stage methods decorated with
    ``run_forever`` are still placeholders (``pass``) in this version.
    """

    def __init__(self):
        # User-Agent header dict; not used by the code visible here,
        # presumably sent with the HTTP requests -- confirm when the fetch
        # stage is implemented.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        # URL template; ``{}`` is filled with the page number.
        self.url_pattern = 'https://www.qiushibaike.com/8hr/page/{}/'
        # URL queue.
        self.url_queue = Queue()
        # Response (page) queue.
        self.page_queue = Queue()
        # Data queue.
        self.data_queue = Queue()
        # gevent pool: tasks are coroutines (greenlets), not OS threads.
        self.pool = Pool()

    def add_url_to_queue(self):
        """Put the URLs of pages 1 through 13 onto ``url_queue``."""
        for page_number in range(1, 14):
            self.url_queue.put(self.url_pattern.format(page_number))

    # NOTE(review): the three ``run_forever`` stages below are placeholders.
    # Their loops never block, so under gevent's cooperative scheduling they
    # would presumably starve every other greenlet once started; a real body
    # is expected to block on a queue ``get()`` and call ``task_done()`` so
    # that the ``join()`` calls in ``run`` can return.

    @run_forever
    def add_page_to_queue(self):
        """Send a request and fetch the page data (placeholder, not implemented)."""
        pass

    @run_forever
    def add_dz_to_queue(self):
        """Placeholder stage, not implemented."""
        pass

    def get_first_element(self, list):
        """Return the first element of ``list``, or ``None`` if it is empty.

        The parameter name shadows the builtin ``list``; it is kept so that
        existing keyword callers keep working.
        """
        return list[0] if list else None

    @run_forever
    def save_dz_list(self):
        """Save the joke data to a file (placeholder, not implemented)."""
        pass

    def run_use_more_task(self, func, count=1):
        """Schedule ``func`` on the pool ``count`` times.

        Args:
            func: Zero-argument callable (typically a bound method).
            count: Number of concurrent tasks to start for ``func``.
        """
        for _ in range(count):
            self.pool.apply_async(func)

    def run(self):
        """Start every stage in the pool, then wait for the queues to drain."""
        self.run_use_more_task(self.add_url_to_queue)
        self.run_use_more_task(self.add_page_to_queue, 3)
        self.run_use_more_task(self.add_dz_to_queue, 2)
        self.run_use_more_task(self.save_dz_list, 2)
        # Sleep 0.01 s so the scheduled tasks get a chance to start and
        # fill the queues before we begin waiting on them.
        time.sleep(0.01)
        # Queue.join() blocks until every item put on the queue has been
        # marked done with task_done().
        self.url_queue.join()
        self.page_queue.join()
        self.data_queue.join()
# Script entry point: build the spider and run it only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    qbs = JiubaiSpider()
    qbs.run()