CrawlSpider: introduction and source code analysis

1. The spider file

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class Sohu2Spider(CrawlSpider):
    name = 'sohu2'
    allowed_domains = ['aaa']
    start_urls = ['http://aaa/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        return item
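
For reference, this appears to be exactly the skeleton produced by Scrapy's built-in crawl template, so a file like it can be generated inside an existing project with the following command (sohu2 and aaa being the spider name and placeholder domain used above):

scrapy genspider -t crawl sohu2 aaa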

2. A brief introduction

CrawlSpider is a generic spider provided by Scrapy. In the spider file you define crawling rules with Rule objects to drive the scraping; each Rule holds the configuration for extracting and following links. Based on these rules, the spider decides which links on the current page should be crawled further and which callback should parse the result of each crawled page.
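
To make that concrete, here is a minimal sketch with two rules: the first only follows listing pages, the second sends detail pages to parse_item. The site, URL patterns and item fields are assumptions for illustration, not part of the original example.

# a minimal sketch; site, URL patterns and fields are illustrative assumptions
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BookSpider(CrawlSpider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    rules = (
        # follow category/listing pages without parsing them
        Rule(LinkExtractor(allow=r'catalogue/category/'), follow=True),
        # parse book detail pages; follow defaults to False because a callback is set
        Rule(LinkExtractor(allow=r'catalogue/.+/index\.html',
                           deny=r'catalogue/category/'),
             callback='parse_item'),
    )

    def parse_item(self, response):
        yield {
            'title': response.css('h1::text').get(),
            'price': response.css('p.price_color::text').get(),
        }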

3. CrawlSpider source code

"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.

See documentation in docs/topics/spiders.rst
"""

import copy
import warnings

import six

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.python import get_func_args
from scrapy.spiders import Spider


def _identity(request, response):
    return request


def _get_method(method, spider):
    if callable(method):
        return method
    elif isinstance(method, six.string_types):
        return getattr(spider, method, None)


class Rule(object):

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request or _identity
        self.process_request_argcount = None
        self.follow = follow if follow is not None else not callback

    def _compile(self, spider):
        self.callback = _get_method(self.callback, spider)
        self.process_links = _get_method(self.process_links, spider)
        self.process_request = _get_method(self.process_request, spider)
        self.process_request_argcount = len(get_func_args(self.process_request))
        if self.process_request_argcount == 1:
            msg = 'Rule.process_request should accept two arguments (request, response), accepting only one is deprecated'
            warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)

    def _process_request(self, request, response):
        """
        Wrapper around the request processing function to maintain backward
        compatibility with functions that do not take a Response object
        """
        args = [request] if self.process_request_argcount == 1 else [request, response]
        return self.process_request(*args)


class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    def _build_request(self, rule, link):
        r = Request(url=link.url, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                request = self._build_request(n, link)
                yield rule._process_request(request, response)

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule._compile(self)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider

4. Source code analysis

CrawlSpider inherits from the Spider class. Besides everything it inherits from Spider, it adds a few attributes and methods of its own; a short sketch of the overall flow follows the list below.

  • rules: the crawling-rules attribute, a tuple of one or more Rule objects; CrawlSpider reads every Rule in rules and compiles it
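
As the source above shows, responses for the start_urls go through parse, which CrawlSpider reserves for its own logic, and are handed to parse_start_url (the default implementation just returns []); pages reached through a Rule come back through _response_downloaded, which looks up the rule index stored in response.meta['rule'] and calls that rule's callback. A hypothetical spider that also needs data from the start page can therefore override parse_start_url instead of parse, along these lines (the site, selector and field names are assumptions for illustration):

# a minimal sketch; site, selector and field names are illustrative assumptions
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class StartPageSpider(CrawlSpider):
    name = 'startpage'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    rules = (
        Rule(LinkExtractor(allow=r'catalogue/'), callback='parse_item'),
    )

    def parse_start_url(self, response):
        # called by CrawlSpider.parse for the start_urls responses;
        # anything yielded here is treated like normal callback output
        yield {'start_page_title': response.css('title::text').get()}

    def parse_item(self, response):
        yield {'url': response.url}
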
4.1 Rule source
class Rule(object):

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
        pass
    
4.2 Parameters
  • link_extractor: a LinkExtractor object that defines which links to extract; every extracted link automatically becomes a Request. Its constructor is shown below:
class LxmlLinkExtractor(FilteringLinkExtractor):

    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False,
                 unique=True, process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True, restrict_text=None):
        pass
  • allow: a regular expression (or list of them); only links on the current page whose URLs match it are extracted and followed
  • deny: the opposite of allow; matching links are excluded
  • allow_domains: the domains whose links are allowed
  • deny_domains: the opposite of allow_domains; links to these domains are excluded
  • restrict_xpaths: XPath expressions that restrict which regions of the page links are extracted from; can be combined with allow to narrow the URL range
  • restrict_css: CSS selectors that restrict which regions of the page links are extracted from
  • callback: the callback for matched pages; avoid using parse as the callback, because CrawlSpider implements its own logic in parse, and overriding it breaks the spider
  • cb_kwargs: a dict of keyword arguments passed to the callback
  • follow: a boolean specifying whether links from pages matched by this rule should themselves be followed; it defaults to True when callback is None and to False when a callback is given
  • process_links: a handler (a callable or the name of a spider method) called with the list of links extracted by the rule, mainly used for filtering
  • process_request: a handler called for every Request generated by the rule; it can modify the request and must return a Request or None (see the sketch after this list)
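
Putting these parameters together, here is a hedged sketch: restrict_xpaths narrows extraction to part of the page, process_links filters the extracted Link objects, and process_request tags each generated Request. The site, URL patterns, XPath and helper method names are assumptions for illustration, not part of the original article.

# a minimal sketch; site, URL patterns, XPath and helper names are assumptions
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class FilteredSpider(CrawlSpider):
    name = 'filtered'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    rules = (
        Rule(
            LinkExtractor(
                allow=r'catalogue/',          # keep only catalogue links
                deny=r'/category/',           # drop category listing pages
                restrict_xpaths='//section',  # only extract links inside <section>
            ),
            callback='parse_item',            # a method name given as a string
            process_links='drop_duplicate_links',
            process_request='tag_request',
            follow=False,                     # explicit; would default to False anyway because a callback is set
        ),
    )

    def drop_duplicate_links(self, links):
        # process_links receives the list of Link objects extracted by the rule
        seen = set()
        unique = []
        for link in links:
            if link.url not in seen:
                seen.add(link.url)
                unique.append(link)
        return unique

    def tag_request(self, request, response):
        # process_request must return a Request or None; mutating meta keeps the
        # 'rule' and 'link_text' keys that _build_request already set. In the
        # version of the source quoted above, a one-argument version still works
        # but emits a ScrapyDeprecationWarning.
        request.meta['source_title'] = response.css('title::text').get()
        return request

    def parse_item(self, response):
        yield {'url': response.url, 'found_via': response.meta.get('source_title')}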