Modeled on Scrapy's five core modules

downloader

import requests
from scrapy_plus.http.response import Response

class Downloader(object):
    def get_response(self, request):
        '''
        Send the request and fetch the data.
        :param request: a Request object
        :return: a Response object handed back to the spider
        '''
        if request.method.upper() == 'GET':
            resp = requests.get(request.url, params=request.params, headers=request.headers)
        elif request.method.upper() == 'POST':
            resp = requests.post(request.url, params=request.params, headers=request.headers, data=request.data)
        else:
            raise Exception('Invalid request method')

        return Response(resp.url, resp.status_code, resp.headers, resp.content)
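
The post never lists the Request and Response containers in scrapy_plus.http. The sketch below is a minimal guess, assuming only the attributes the other modules actually access (url, method, params, headers, data, parse, meta, spider_name on the request; url, status_code, headers, body, meta on the response); field names and defaults beyond that are assumptions.

# Minimal sketch of the assumed Request/Response containers (not shown in the post).

class Request(object):
    '''Carries everything the downloader needs plus routing info for the engine.'''
    def __init__(self, url, method='GET', headers=None, params=None, data=None,
                 parse='parse', meta=None):
        self.url = url
        self.method = method        # 'GET' or 'POST'
        self.headers = headers
        self.params = params        # query-string parameters
        self.data = data            # POST body
        self.parse = parse          # name of the spider callback, resolved via getattr
        self.meta = meta            # user data copied onto the response by the engine
        self.spider_name = None     # set by the engine to route the response back


class Response(object):
    '''Wraps the raw requests result in the order the downloader passes the fields.'''
    def __init__(self, url, status_code, headers, body):
        self.url = url
        self.status_code = status_code
        self.headers = headers
        self.body = body
        self.meta = None            # filled in by the engine from request.meta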

spider

from scrapy_plus.http.request import Request
from scrapy_plus.item import Item

class Spider(object):
    start_url = []

    def start_requests(self):
        '''
        Build the Request objects for the initial crawl.
        :return: generator of Request objects
        '''
        for url in self.start_url:  # yield one request per start URL
            yield Request(url)

    def parse(self, response):  # called once per response, may yield many results
        '''
        Parse the response and hand the data to the pipeline.
        :param response:
        :return: generator of Item (or Request) objects
        '''
        yield Item(response.body)
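
The Item class imported above is not listed in the post either. A minimal sketch of what it likely wraps, together with a concrete spider built on the base class, is below; the spider name and start URL are placeholders, and the name attribute is assumed because the engine keys its spider dict by cls().name.

# Minimal sketch of the assumed Item container plus a hypothetical concrete spider.

class Item(object):
    def __init__(self, data):
        self._data = data       # raw parsed data carried to the pipeline

    @property
    def data(self):
        return self._data


class BaiduSpider(Spider):                   # hypothetical example, not from the post
    name = 'baidu'                           # the engine stores spiders in a dict keyed by this name
    start_url = ['http://www.baidu.com']     # placeholder start URL

    def parse(self, response):
        yield Item(response.body)            # hand the page body to the pipeline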

scheduler

from six.moves.queue import Queue

class Scheduler(object):
    def __init__(self):
        self.queue = Queue()

    def add_request(self, request):
        '''
        Put a Request object on the queue.
        :param request:
        :return:
        '''
        self.queue.put(request)

    def get_request(self):
        '''Pop a Request without blocking; return None when the queue is empty.'''
        try:
            request = self.queue.get(False)
        except Exception:
            return None
        else:
            return request

    def _filter_request(self):
        '''Request deduplication'''
        # not implemented yet
        pass
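
_filter_request is left as a stub (the engine also marks deduplication as a TODO). One common way to fill it in later is a fingerprint set keyed on method and URL; the sketch below is only an illustration of that idea under those assumptions, not code from the post, and the fingerprint scheme and _fingerprints attribute are made up here.

# Illustrative sketch: how the Scheduler could deduplicate requests later.
import hashlib
from six.moves.queue import Queue

class Scheduler(object):
    def __init__(self):
        self.queue = Queue()
        self._fingerprints = set()           # fingerprints of requests already queued

    def add_request(self, request):
        if self._filter_request(request):    # only queue requests not seen before
            self.queue.put(request)

    def _filter_request(self, request):
        '''Return True if the request has not been seen before.'''
        raw = '{} {}'.format(request.method.upper(), request.url)
        fp = hashlib.sha1(raw.encode('utf-8')).hexdigest()
        if fp in self._fingerprints:
            return False
        self._fingerprints.add(fp)
        return True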

engine

from scrapy_plus.http.request import Request
from .scheduler import Scheduler
from .spider import Spider
from .downloader import Downloader
import importlib
from .pipeline import Pipeline

from scrapy_plus.middle.downloadermiddle import DownloaderMid
from scrapy_plus.middle.spidermiddle import SpiderMid
from scrapy_plus.utils.log import logger
from datetime import datetime
from scrapy_plus.conf.settings import DOWNLOADERMID, SPIDERMMID, PIPELINES, SPIDERS
import time

class Engine(object):
    def __init__(self):
        self.spiders = self._get_object(SPIDERS, is_spider=True)  # spiders, a dict keyed by name
        self.pipelines = self._get_object(PIPELINES)      # pipelines, a list
        self.dloaders = self._get_object(DOWNLOADERMID)   # downloader middlewares, a list
        self.spidermids = self._get_object(SPIDERMMID)    # spider middlewares, a list
        self.scheduler = Scheduler()     # scheduler
        self.downloader = Downloader()   # downloader
        self.total_request = 0    # number of requests issued
        self.total_response = 0   # number of responses processed

    def _get_object(self, path, is_spider=False):  # build objects dynamically from dotted paths
        if is_spider:
            instance = {}
        else:
            instance = []
        for i in path:
            module = i.rsplit('.', 1)[0]      # module path, e.g. 'package.module'

            cls_name = i.rsplit('.', 1)[-1]   # class name, e.g. 'ClassName'

            model = importlib.import_module(module)  # import the module from its string path

            cls = getattr(model, cls_name)  # look the class up by name inside the module

            if is_spider:
                spider = cls()
                instance[spider.name] = spider  # spiders are keyed by their name attribute
            else:
                instance.append(cls())

        return instance

    def _get_request(self):
        for spider_name, spider in self.spiders.items():  # walk the spider dict
            for request in spider.start_requests():  # get the initial Request objects
                for spidermid in self.spidermids:  # run them through the spider middlewares
                    request = spidermid.process_request(request)
                request.spider_name = spider_name  # tag the request with its spider so the response can be routed back
                # TODO: deduplicate requests

                self.scheduler.add_request(request)  # put it on the scheduler queue
                self.total_request += 1

    def _get_one_response(self):

        request = self.scheduler.get_request()  # pull one request off the queue

        if request is None:
            return  # nothing left, stop here
        for dloader in self.dloaders:  # run the request through the downloader middlewares
            request = dloader.process_request(request)

        response = self.downloader.get_response(request)  # fetch the response
        for dloader in self.dloaders:
            response = dloader.process_response(response)  # downloader middlewares handle the response
        for spidermid in self.spidermids:
            response = spidermid.process_response(response)  # spider middlewares handle the response
        response.meta = request.meta  # copy the request's meta onto the response
        spider = self.spiders[request.spider_name]  # look the spider object up in the dict

        fun = getattr(spider, request.parse)  # request.parse is the callback name as a string; resolve it on the spider

        for result in fun(response):  # iterate over the parsed results

            # if the result is a Request object, hand it back to the scheduler
            if isinstance(result, Request):
                for spidermid in self.spidermids:
                    result = spidermid.process_request(result)  # follow-up requests go through the spider middlewares again
                result.spider_name = request.spider_name  # tag it so the right spider handles the next response
                self.scheduler.add_request(result)
                self.total_request += 1
            else:
                for pipeline in self.pipelines:  # otherwise send the item through every pipeline
                    result = pipeline.process_item(result, spider)
        self.total_response += 1

    def _start_engine(self):
        self._get_request()
        while True:
            self._get_one_response()
            if self.total_response >= self.total_request:
                break

    def start(self):
        '''Start the whole engine.'''
        start = datetime.now()  # start time
        logger.info("Start time: %s" % start)  # log when the run started
        self._start_engine()
        stop = datetime.now()  # end time
        logger.info("End time: %s" % stop)  # log when the run finished
        logger.info("Elapsed: %.2f" % (stop - start).total_seconds())  # log the elapsed seconds
        logger.info('Sent {} requests'.format(self.total_request))
        logger.info('Processed {} responses'.format(self.total_response))
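
The DownloaderMid and SpiderMid classes the engine imports are not shown in the post. Judging from how the engine calls them, each only needs process_request and process_response methods that return whatever they were given; the sketch below is an assumption based on that contract, with placeholder behavior in the comments.

# Minimal sketch of the middleware classes the engine expects (not shown in the post).
# Each method must return the (possibly modified) request/response it received.

class DownloaderMid(object):
    def process_request(self, request):
        # e.g. inject default headers before the download
        return request

    def process_response(self, response):
        # e.g. inspect the status code after the download
        return response


class SpiderMid(object):
    def process_request(self, request):
        # runs on every request yielded by a spider
        return request

    def process_response(self, response):
        # runs on every response before it reaches the spider callback
        return response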

pipeline

class Pipeline(object):
    def process_item(self, item, spider):
        '''Process one parsed item; the engine also passes the spider that produced it.'''
        print(item)
        return item  # return the item so the next pipeline in the list receives it
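
The conf.settings module the engine reads is not shown either. Assuming it simply holds the dotted-path lists that _get_object consumes, a run could be wired up roughly as below; the spider and pipeline paths, the entry-point file, and the engine's package path are placeholders, while the middleware paths come from the imports shown earlier.

# Hypothetical scrapy_plus/conf/settings.py
SPIDERS = ['spiders.baidu.BaiduSpider']        # spider classes, stored in a dict keyed by their .name
PIPELINES = ['pipelines.ConsolePipeline']      # item pipelines, run in list order
DOWNLOADERMID = ['scrapy_plus.middle.downloadermiddle.DownloaderMid']
SPIDERMMID = ['scrapy_plus.middle.spidermiddle.SpiderMid']

# Hypothetical main.py
from scrapy_plus.core.engine import Engine     # assumed package path for the Engine class

if __name__ == '__main__':
    engine = Engine()   # builds spiders, pipelines and middlewares from the settings lists
    engine.start()      # runs until every queued request has a processed response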
