关于Scrapy 批量抓取顺序的调整

(一)深度优先

在之前Scrapy基本使用的demo中,会发现,爬虫抓取的顺序是深度优先的【抓取内容共有五页,每页有6条>>>在还没有把第一页内容都请求结束的时候,已经开始请求其他页面的内容了】

(二)广度优先

先上代码

1.mcdonalds.py
import scrapy
from Mcdonalds.items import McdonaldsItem


class McdonaldsSpider(scrapy.Spider):
    """Breadth-first demo spider for McDonald's China corporate news.

    Requests list pages 1-6, then follows every article link to fetch the
    full body.  The page number and per-page item counter travel through
    ``request.meta`` so the crawl order is visible in the output.
    """
    name = 'mcdonalds'
    allowed_domains = ['www.mcdonalds.com.cn']

    def start_requests(self):
        """Yield the six list-page requests (serves the role of start_urls)."""
        for page_num in range(1, 7):
            url = 'https://www.mcdonalds.com.cn/news/corporate?page=' + str(page_num)
            yield scrapy.Request(url, callback=self.parse, meta={'page_num': page_num})

    def parse(self, response, **kwargs):
        """Parse one list page and request each article's detail page."""
        news_list = response.xpath('''//div[@class='news-center-list']/ul/li''')
        for c_num, li in enumerate(news_list, start=1):
            # extract_first() returns None on a missing node; the original
            # [0].extract() raised IndexError on malformed entries instead.
            title = li.xpath('''./h4/a/text()''').extract_first()
            time = li.xpath('''./time/text()''').extract_first()
            href = li.xpath('''./h4/a/@href''').extract_first()
            if href is None:
                # No link means no detail page to follow; skip this entry.
                continue
            detail_url = 'https://www.mcdonalds.com.cn' + href
            item = dict()
            item['title'] = title
            item['time'] = time
            item['detail_url'] = detail_url
            # Pass the partially-filled item plus counters to the detail callback.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'c_num': c_num, 'page_num': response.meta['page_num'], 'item': item})

    def parse_detail(self, response, **kwargs):
        """Parse one article page and yield the completed item."""
        print(f"-----------------第{response.meta['page_num']}页---第{response.meta['c_num']}个---------------")
        item = response.meta['item']
        # Original code did xpath(...)[0].extract(), which raises IndexError
        # when the div is absent and made the guard below unreachable.
        detail_content = response.xpath('''//div[@class="cmsPage"]''').extract_first()
        if detail_content:
            item['detail_content'] = detail_content
            item['page_num'] = response.meta['page_num']
            item['c_num'] = response.meta['c_num']
            yield item

2.settings.py

BOT_NAME = 'Mcdonalds'

SPIDER_MODULES = ['Mcdonalds.spiders']
NEWSPIDER_MODULE = 'Mcdonalds.spiders'

USER_AGENT = 'Mozilla/5.0 ...'

ROBOTSTXT_OBEY = False
# LOG_LEVEL = 'ERROR'


# Maximum crawl depth allowed; the current depth can be read from request.meta.
# 0 means unlimited depth.
# DEPTH_LIMIT = 3

# DEPTH_PRIORITY = 0 (default): LIFO scheduling, i.e. depth-first.
# DEPTH_PRIORITY = 1: FIFO scheduling, i.e. breadth-first.
# LIFO, depth-first:
# DEPTH_PRIORITY = 0

# FIFO, breadth-first (must be paired with the two FIFO queue classes below):
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

CONCURRENT_REQUESTS = 2
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
   'Mcdonalds.pipelines.McdonaldsPipeline': 300,
   'Mcdonalds.pipelines.mysqlPipeline': 301,
    # Pipeline registry: the number is the priority; lower numbers run first.
}

3.pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class McdonaldsPipeline:
    """Append every scraped article to a local text file."""

    # File handle; opened once when the crawl starts, closed when it ends.
    fp = None

    def open_spider(self, spider):
        """Scrapy hook, runs exactly once at spider startup: open the file."""
        print('开始爬虫............')
        self.fp = open('./mcdonalds_1.txt', 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one item's fields to the file, then pass the item along."""
        heading = item['title']
        published = item['time']
        body = item['detail_content']
        link = item['detail_url']
        idx = item['c_num']
        page = item['page_num']
        record = f">>>第-{page}-页--第-{idx}-个<<<\n{heading}{published}\n{body}\n{link}\n\n"
        self.fp.write(record)
        return item

    def close_spider(self, spider):
        """Scrapy hook, runs exactly once at spider shutdown: close the file."""
        print('结束爬虫............')
        self.fp.close()


class mysqlPipeline:
    """Insert every scraped article into the ``mcdonalds`` MySQL table."""

    # Class-level defaults so close_spider never hits AttributeError even if
    # the crawl produced no items or the connection was never established.
    conn = None
    cursor = None

    def open_spider(self, spider):
        """Scrapy hook, runs exactly once at startup: connect to MySQL."""
        print('开始爬虫............')
        self.conn = pymysql.connect('***')  # connect to the database (credentials elided)
        # Create the cursor once here; the original opened a fresh cursor on
        # every process_item call and never closed the previous one (leak).
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item; roll back on failure so later items still commit."""
        title = item['title']
        time = item['time']
        detail_content = item['detail_content']
        detail_url = item['detail_url']
        try:
            self.cursor.execute('insert into mcdonalds(`title`,`issueTime`,`content`,`url`) value(%s,%s,%s,%s);', [title, time, detail_content, detail_url])
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Scrapy hook, runs exactly once at shutdown: release DB resources."""
        print('结束爬虫............')
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

在 settings.py 中,设置了
DEPTH_PRIORITY = 1 ,
SCHEDULER_DISK_QUEUE ,
SCHEDULER_MEMORY_QUEUE ,
先进先出,广度优先,同时 设置了
CONCURRENT_REQUESTS = 2,(并发2)

4.运行发现,【抓取内容共有6页,每页有6条>>>在并发为2的情况下,会先顺序请求第 1、3页内容,内容都请求结束后,才会顺序请求第 2、4页内容,最后顺序请求第 5、6页内容】这样会比之前有序很多。

(三)最后,请求可以采取队列或栈的方式,针对大型请求还是需要运用 Scrapy-redis, 这样会进一步精准控制每个请求,速度会更快。

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值