downloader

import requests
from scrapy_plus.http.response import Response


class Downloader(object):

    def get_response(self, request):
        '''
        Send the request and fetch the response.
        :param request: request object to send
        :return: Response object handed back to the spider
        '''
        if request.method.upper() == 'GET':
            resp = requests.get(request.url, params=request.params, headers=request.headers)
        elif request.method.upper() == 'POST':
            resp = requests.post(request.url, params=request.params, headers=request.headers, data=request.data)
        else:
            raise Exception('Unsupported request method')
        return Response(resp.url, resp.status_code, resp.headers, resp.content)
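The Response class imported from scrapy_plus.http.response is not listed in this section. Below is a minimal sketch of what it is assumed to look like, based only on the four positional arguments passed above and the body/meta attributes the spider and engine read later; treat it as an assumption, not the original implementation.

# scrapy_plus/http/response.py -- assumed minimal implementation (not shown in the original)
class Response(object):

    def __init__(self, url, status_code, headers, body):
        self.url = url                  # final URL after any redirects
        self.status_code = status_code  # HTTP status code
        self.headers = headers          # response headers
        self.body = body                # raw response bytes, parsed by the spider
        self.meta = None                # filled in by the engine from request.meta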
spider

from scrapy_plus.http.request import Request
from scrapy_plus.item import Item


class Spider(object):

    name = None       # each concrete spider must set a unique name; the engine keys its spider dict by it
    start_url = []    # list of start URLs

    def start_requests(self):
        '''
        Build the initial Request objects, one per start URL.
        :return: generator of Request objects
        '''
        for url in self.start_url:
            yield Request(url)

    def parse(self, response):
        '''
        Parse the response and yield data to the pipelines.
        :param response:
        :return: generator of Item objects (or further Request objects)
        '''
        yield Item(response.body)
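The Request and Item classes imported here are not listed either. The sketch below reconstructs what they are assumed to provide, based on the attributes the downloader and engine access (method, url, headers, params, data, meta, parse, spider_name); the default values are assumptions.

# scrapy_plus/http/request.py -- assumed minimal implementation
class Request(object):

    def __init__(self, url, method='GET', headers=None, params=None, data=None, meta=None, parse='parse'):
        self.url = url
        self.method = method        # 'GET' or 'POST', read by the downloader
        self.headers = headers
        self.params = params        # query-string parameters
        self.data = data            # POST body
        self.meta = meta            # copied onto the response by the engine
        self.parse = parse          # name of the spider callback, resolved via getattr
        self.spider_name = None     # set by the engine so the response is routed back correctly


# scrapy_plus/item.py -- assumed minimal implementation
class Item(object):

    def __init__(self, data):
        self.data = data            # the extracted data carried to the pipelines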
scheduler

from six.moves.queue import Queue, Empty


class Scheduler(object):

    def __init__(self):
        self.queue = Queue()

    def add_request(self, request):
        '''
        Put a request object onto the queue.
        :param request:
        :return:
        '''
        self.queue.put(request)

    def get_request(self):
        '''Take a request off the queue; return None if the queue is empty.'''
        try:
            request = self.queue.get(False)
        except Empty:
            return None
        else:
            return request

    def _filter_request(self):
        '''Request deduplication (not implemented yet).'''
        pass
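The _filter_request stub above is intentionally left unimplemented. One common way to fill it in later is to hash each request's method and URL into a fingerprint and keep a set of fingerprints already seen. The variant below is only an illustration of that idea under those assumptions, not part of the original code.

# Illustrative sketch: a Scheduler variant with fingerprint-based deduplication
import hashlib
from six.moves.queue import Queue


class DedupScheduler(object):

    def __init__(self):
        self.queue = Queue()
        self._fingerprints = set()   # fingerprints of requests already accepted

    def _fingerprint(self, request):
        '''Hash method + URL into a stable fingerprint string.'''
        sha1 = hashlib.sha1()
        sha1.update(request.method.upper().encode('utf-8'))
        sha1.update(request.url.encode('utf-8'))
        return sha1.hexdigest()

    def _filter_request(self, request):
        '''Return True if the request is new, False if it is a duplicate.'''
        fp = self._fingerprint(request)
        if fp in self._fingerprints:
            return False
        self._fingerprints.add(fp)
        return True

    def add_request(self, request):
        '''Only enqueue requests that pass the duplicate filter.'''
        if self._filter_request(request):
            self.queue.put(request)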
engine

from datetime import datetime
import importlib
import time

from scrapy_plus.http.request import Request
from .scheduler import Scheduler
from .spider import Spider
from .downloader import Downloader
from .pipeline import Pipeline
from scrapy_plus.middle.downloadermiddle import DownloaderMid
from scrapy_plus.middle.spidermiddle import SpiderMid
from scrapy_plus.utils.log import logger
from scrapy_plus.conf.settings import DOWNLOADERMID, SPIDERMMID, PIPELINES, SPIDERS


class Engine(object):

    def __init__(self):
        self.spiders = self._get_object(SPIDERS, is_spider=True)  # spiders: dict keyed by spider name
        self.pipelines = self._get_object(PIPELINES)              # pipelines: list
        self.dloaders = self._get_object(DOWNLOADERMID)           # downloader middlewares: list
        self.spidermids = self._get_object(SPIDERMMID)            # spider middlewares: list
        self.scheduler = Scheduler()                              # scheduler
        self.downloader = Downloader()                            # downloader
        self.total_request = 0                                    # number of requests sent
        self.total_response = 0                                   # number of responses processed

    def _get_object(self, path, is_spider=False):
        '''Dynamically instantiate objects from a list of dotted import paths.'''
        if is_spider:
            instance = {}
        else:
            instance = []
        for i in path:
            module = i.rsplit('.', 1)[0]      # module part of the dotted path
            cls_name = i.rsplit('.', 1)[-1]   # class name part
            model = importlib.import_module(module)  # import the module from its dotted path
            cls = getattr(model, cls_name)            # look up the class object by name
            if is_spider:
                obj = cls()
                instance[obj.name] = obj       # spiders are keyed by their name attribute
            else:
                instance.append(cls())
        return instance

    def _get_request(self):
        '''Build the start requests of every spider and hand them to the scheduler.'''
        for spider_name, spider in self.spiders.items():   # iterate over the spider dict
            for request in spider.start_requests():        # get the initial request objects
                for spidermid in self.spidermids:           # spider middlewares process the first requests
                    request = spidermid.process_request(request)
                request.spider_name = spider_name           # tag the request with its spider
                # TODO: request deduplication
                self.scheduler.add_request(request)         # enqueue the request
                self.total_request += 1

    def _get_one_response(self):
        '''Take one request from the scheduler, download it and process the result.'''
        request = self.scheduler.get_request()   # pull a request from the queue
        if request is None:
            return  # queue is empty, nothing to do
        for dloader in self.dloaders:             # downloader middlewares process the request
            request = dloader.process_request(request)
        response = self.downloader.get_response(request)  # fetch the response
        for dloader in self.dloaders:             # downloader middlewares process the response
            response = dloader.process_response(response)
        for spidermid in self.spidermids:         # spider middlewares process the response
            response = spidermid.process_response(response)
        response.meta = request.meta              # copy the request's meta onto the response
        spider = self.spiders[request.spider_name]   # look up the spider that issued the request
        fun = getattr(spider, request.parse)      # request.parse is the callback name as a string
        for result in fun(response):              # iterate over the parsed results
            # if the result is a request object, hand it back to the scheduler
            if isinstance(result, Request):
                for spidermid in self.spidermids:
                    result = spidermid.process_request(result)  # spider middlewares process the new request
                result.spider_name = request.spider_name  # keep the spider tag so the next response is routed correctly
                self.scheduler.add_request(result)
                self.total_request += 1
            else:
                for pipeline in self.pipelines:    # anything else is an item for the pipelines
                    result = pipeline.process_item(result, spider)
        self.total_response += 1

    def _start_engine(self):
        self._get_request()
        while True:
            self._get_one_response()
            if self.total_response >= self.total_request:
                break

    def start(self):
        '''Start the whole engine.'''
        start = datetime.now()                 # start time
        logger.info("Start time: %s" % start)  # log the start time
        self._start_engine()
        stop = datetime.now()                  # end time
        logger.info("End time: %s" % stop)     # log the end time
        logger.info("Elapsed: %.2f seconds" % (stop - start).total_seconds())  # log the running time
        logger.info('Sent {} requests'.format(self.total_request))
        logger.info('Processed {} responses'.format(self.total_response))
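The engine pulls its components from scrapy_plus.conf.settings as lists of dotted import paths (SPIDERS, PIPELINES, DOWNLOADERMID, SPIDERMMID), which _get_object resolves with importlib. A sketch of what that settings module might contain follows; the constant names come from the import above, but the concrete paths and package layout are assumptions.

# scrapy_plus/conf/settings.py -- illustrative contents; the dotted paths are placeholders
SPIDERS = [
    'spiders.baidu.BaiduSpider',                         # hypothetical project spider
]

PIPELINES = [
    'scrapy_plus.core.pipeline.Pipeline',                # package layout assumed
]

DOWNLOADERMID = [
    'scrapy_plus.middle.downloadermiddle.DownloaderMid',
]

SPIDERMMID = [
    'scrapy_plus.middle.spidermiddle.SpiderMid',
]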
pipeline

class Pipeline(object):

    def process_item(self, item, spider):
        '''Print the item and return it so later pipelines can keep processing it.'''
        print(item)
        return item
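Putting the pieces together, a project built on this framework would subclass Spider, register it in the settings, and start the Engine. The sketch below is a hypothetical entry point; the BaiduSpider class, the file names and the exact import paths are illustrative assumptions, not part of the original.

# spiders/baidu.py -- hypothetical project spider
from scrapy_plus.core.spider import Spider   # import path assumed


class BaiduSpider(Spider):
    name = 'baidu'                            # key used by the engine's spider dict
    start_url = ['http://www.baidu.com']      # initial URLs


# main.py -- hypothetical entry point
from scrapy_plus.core.engine import Engine   # import path assumed

if __name__ == '__main__':
    engine = Engine()
    engine.start()   # builds start requests, downloads them and pushes items through the pipelines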