关于Scrapy 批量抓取顺序的调整

(一)深度优先

在之前Scrapy基本使用的demo中,会发现,爬虫抓取的顺序是深度优先的【抓取内容共有五页,每页有6条>>>在还没有把第一页内容都请求结束的时候,已经开始请求其他页面的内容了】

(二)广度优先

先上代码

1.mcdonalds.py
import scrapy
from Mcdonalds.items import McdonaldsItem


class McdonaldsSpider(scrapy.Spider):
    """Breadth-first demo spider for McDonald's China corporate news.

    Requests list pages 1-6, then follows every article link to fetch the
    full body.  The page number and per-page item counter travel through
    ``request.meta`` so the crawl order is visible in the output.
    """
    name = 'mcdonalds'
    allowed_domains = ['www.mcdonalds.com.cn']

    def start_requests(self):
        """Yield the six list-page requests (serves the role of start_urls)."""
        for page_num in range(1, 7):
            url = 'https://www.mcdonalds.com.cn/news/corporate?page=' + str(page_num)
            yield scrapy.Request(url, callback=self.parse, meta={'page_num': page_num})

    def parse(self, response, **kwargs):
        """Parse one list page and request each article's detail page."""
        news_list = response.xpath('''//div[@class='news-center-list']/ul/li''')
        for c_num, li in enumerate(news_list, start=1):
            # extract_first() returns None on a missing node; the original
            # [0].extract() raised IndexError on malformed entries instead.
            title = li.xpath('''./h4/a/text()''').extract_first()
            time = li.xpath('''./time/text()''').extract_first()
            href = li.xpath('''./h4/a/@href''').extract_first()
            if href is None:
                # No link means no detail page to follow; skip this entry.
                continue
            detail_url = 'https://www.mcdonalds.com.cn' + href
            item = dict()
            item['title'] = title
            item['time'] = time
            item['detail_url'] = detail_url
            # Pass the partially-filled item plus counters to the detail callback.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'c_num': c_num, 'page_num': response.meta['page_num'], 'item': item})

    def parse_detail(self, response, **kwargs):
        """Parse one article page and yield the completed item."""
        print(f"-----------------第{response.meta['page_num']}页---第{response.meta['c_num']}个---------------")
        item = response.meta['item']
        # Original code did xpath(...)[0].extract(), which raises IndexError
        # when the div is absent and made the guard below unreachable.
        detail_content = response.xpath('''//div[@class="cmsPage"]''').extract_first()
        if detail_content:
            item['detail_content'] = detail_content
            item['page_num'] = response.meta['page_num']
            item['c_num'] = response.meta['c_num']
            yield item

2.settings.py

BOT_NAME = 'Mcdonalds'

SPIDER_MODULES = ['Mcdonalds.spiders']
NEWSPIDER_MODULE = 'Mcdonalds.spiders'

USER_AGENT = 'Mozilla/5.0 ...'

ROBOTSTXT_OBEY = False
# LOG_LEVEL = 'ERROR'


# Maximum crawl depth allowed; the current depth can be read from request.meta.
# 0 means unlimited depth.
# DEPTH_LIMIT = 3

# DEPTH_PRIORITY = 0 (default): LIFO scheduling, i.e. depth-first.
# DEPTH_PRIORITY = 1: FIFO scheduling, i.e. breadth-first.
# LIFO, depth-first:
# DEPTH_PRIORITY = 0

# FIFO, breadth-first (must be paired with the two FIFO queue classes below):
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

CONCURRENT_REQUESTS = 2
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
   'Mcdonalds.pipelines.McdonaldsPipeline': 300,
   'Mcdonalds.pipelines.mysqlPipeline': 301,
    # Pipeline registry: the number is the priority; lower numbers run first.
}

3.pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class McdonaldsPipeline:
    """Append every scraped article to a local text file."""

    # File handle; opened once when the crawl starts, closed when it ends.
    fp = None

    def open_spider(self, spider):
        """Scrapy hook, runs exactly once at spider startup: open the file."""
        print('开始爬虫............')
        self.fp = open('./mcdonalds_1.txt', 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one item's fields to the file, then pass the item along."""
        heading = item['title']
        published = item['time']
        body = item['detail_content']
        link = item['detail_url']
        idx = item['c_num']
        page = item['page_num']
        record = f">>>第-{page}-页--第-{idx}-个<<<\n{heading}{published}\n{body}\n{link}\n\n"
        self.fp.write(record)
        return item

    def close_spider(self, spider):
        """Scrapy hook, runs exactly once at spider shutdown: close the file."""
        print('结束爬虫............')
        self.fp.close()


class mysqlPipeline:
    """Insert every scraped article into the ``mcdonalds`` MySQL table."""

    # Class-level defaults so close_spider never hits AttributeError even if
    # the crawl produced no items or the connection was never established.
    conn = None
    cursor = None

    def open_spider(self, spider):
        """Scrapy hook, runs exactly once at startup: connect to MySQL."""
        print('开始爬虫............')
        self.conn = pymysql.connect('***')  # connect to the database (credentials elided)
        # Create the cursor once here; the original opened a fresh cursor on
        # every process_item call and never closed the previous one (leak).
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item; roll back on failure so later items still commit."""
        title = item['title']
        time = item['time']
        detail_content = item['detail_content']
        detail_url = item['detail_url']
        try:
            self.cursor.execute('insert into mcdonalds(`title`,`issueTime`,`content`,`url`) value(%s,%s,%s,%s);', [title, time, detail_content, detail_url])
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Scrapy hook, runs exactly once at shutdown: release DB resources."""
        print('结束爬虫............')
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

在 settings.py 中,设置了
DEPTH_PRIORITY = 1 ,
SCHEDULER_DISK_QUEUE ,
SCHEDULER_MEMORY_QUEUE ,
先进先出,广度优先,同时 设置了
CONCURRENT_REQUESTS = 2,(并发2)

4.运行发现,【抓取内容共有6页,每页有6条>>>在并发为2的情况下,会先顺序请求第 1、3页内容,内容都请求结束后,才会顺序请求第 2、4页内容,最后顺序请求第 5、6页内容】这样会比之前有序很多。

(三)最后,请求可以采取队列或栈的方式,针对大型请求还是需要运用 Scrapy-redis, 这样会进一步精准控制每个请求,速度会更快。

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值