Scrapy Crawl Order: Depth-First vs. Breadth-First
(1) Depth-First
In the earlier Scrapy basic-usage demo, you can see that the spider crawls in depth-first order (the target content spans five list pages with six entries each, yet the spider starts requesting content from other pages before all of page 1's entries have been fetched).
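That depth-first behavior comes from Scrapy's stock scheduler settings, which use LIFO queues. For contrast with the breadth-first configuration shown later, these are the defaults (you normally never set them explicitly):

# settings.py -- Scrapy's stock defaults, shown only for contrast
DEPTH_PRIORITY = 0                                             # no depth-based reprioritization
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'    # last in, first out
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'      # last in, first out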
(2) Breadth-First
The code first:
1.mcdonalds.py
import scrapy


class McdonaldsSpider(scrapy.Spider):
    name = 'mcdonalds'
    allowed_domains = ['www.mcdonalds.com.cn']

    def start_requests(self):
        """Serves the same purpose as start_urls: yields the initial list-page requests."""
        for page_num in range(1, 7):
            url = 'https://www.mcdonalds.com.cn/news/corporate?page=' + str(page_num)
            yield scrapy.Request(url, callback=self.parse, meta={'page_num': page_num})

    def parse(self, response, **kwargs):
        """Parse a list page."""
        c_num = 0
        news_list = response.xpath('''//div[@class='news-center-list']/ul/li''')
        for li in news_list:
            c_num += 1
            title = li.xpath('''./h4/a/text()''')[0].extract()
            time = li.xpath('''./time/text()''').extract_first()
            detail_url = 'https://www.mcdonalds.com.cn' + li.xpath('''./h4/a/@href''')[0].extract()
            item = dict()
            item['title'] = title
            item['time'] = time
            item['detail_url'] = detail_url
            # Pass data along with the request via meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={'c_num': c_num, 'page_num': response.meta['page_num'], 'item': item})

    def parse_detail(self, response, **kwargs):
        """Parse a detail page."""
        print(f"----------------- page {response.meta['page_num']} --- item {response.meta['c_num']} -----------------")
        item = response.meta['item']
        # extract_first() returns None when nothing matches, so the check below
        # actually guards against a missing node (indexing with [0] would raise instead)
        detail_content = response.xpath('''//div[@class="cmsPage"]''').extract_first()
        if detail_content:
            item['detail_content'] = detail_content
            item['page_num'] = response.meta['page_num']
            item['c_num'] = response.meta['c_num']
            yield item
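The spider above passes data to the callback through meta. On Scrapy 1.7+ the documented alternative is cb_kwargs, which keeps meta free for the built-in middlewares that also use it. A minimal hypothetical spider showing the same hand-off (the class and spider name here are illustrative, not from the project above):

import scrapy


class CbKwargsSketchSpider(scrapy.Spider):
    """Hypothetical minimal spider, only to illustrate cb_kwargs (Scrapy >= 1.7)."""
    name = 'cb_kwargs_sketch'
    start_urls = ['https://www.mcdonalds.com.cn/news/corporate?page=1']

    def parse(self, response, **kwargs):
        for c_num, li in enumerate(response.xpath("//div[@class='news-center-list']/ul/li"), start=1):
            detail_url = response.urljoin(li.xpath('./h4/a/@href').get())
            # cb_kwargs entries arrive as real keyword arguments of the callback
            yield scrapy.Request(detail_url, callback=self.parse_detail,
                                 cb_kwargs={'c_num': c_num, 'page_num': 1})

    def parse_detail(self, response, c_num, page_num):
        self.logger.info('page %s, item %s: %s', page_num, c_num, response.url)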
2.settings.py
BOT_NAME = 'Mcdonalds'

SPIDER_MODULES = ['Mcdonalds.spiders']
NEWSPIDER_MODULE = 'Mcdonalds.spiders'

USER_AGENT = 'Mozilla/5.0 ...'

ROBOTSTXT_OBEY = False
# LOG_LEVEL = 'ERROR'

# Maximum crawl depth allowed; the current depth can be read from response.meta; 0 means unlimited
# DEPTH_LIMIT = 3

# 0 = LIFO, depth-first (the default); 1 = FIFO, breadth-first
# last in, first out: depth-first
# DEPTH_PRIORITY = 0
# first in, first out: breadth-first
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

CONCURRENT_REQUESTS = 2
DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {
    # pipeline registry: the number is a priority, and lower numbers run first
    'Mcdonalds.pipelines.McdonaldsPipeline': 300,
    'Mcdonalds.pipelines.mysqlPipeline': 301,
}
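To see the traversal order directly, you can log the depth that Scrapy's DepthMiddleware (enabled by default) records for each response. A small sketch of a callback body to drop into the spider above:

    # Sketch: DepthMiddleware stores each response's depth in meta,
    # so any callback can report where it sits in the crawl tree.
    def parse(self, response, **kwargs):
        self.logger.info('depth=%s url=%s', response.meta.get('depth', 0), response.url)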
3.pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class McdonaldsPipeline:
    fp = None

    # Overridden hook: called exactly once, when the spider opens
    def open_spider(self, spider):
        print('Spider started............')
        self.fp = open('./mcdonalds_1.txt', 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        # Receives the item yielded by the spider
        title = item['title']
        time = item['time']
        detail_content = item['detail_content']
        detail_url = item['detail_url']
        c_num = item['c_num']
        page_num = item['page_num']
        self.fp.write(f">>> page {page_num} -- item {c_num} <<<\n{title}: {time}\n{detail_content}\n{detail_url}\n\n")
        return item

    def close_spider(self, spider):
        print('Spider finished............')
        self.fp.close()


class mysqlPipeline:
    conn = None
    cursor = None

    # Overridden hook: called exactly once, when the spider opens
    def open_spider(self, spider):
        print('Spider started............')
        # Recent pymysql versions require keyword arguments; the credentials stay elided here
        self.conn = pymysql.connect(host='***', user='***', password='***', database='***', charset='utf8mb4')
        # Create the cursor once, so close_spider can rely on it even if no item arrives
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Receives the item yielded by the spider
        title = item['title']
        time = item['time']
        detail_content = item['detail_content']
        detail_url = item['detail_url']
        try:
            self.cursor.execute(
                'insert into mcdonalds(`title`,`issueTime`,`content`,`url`) values (%s,%s,%s,%s);',
                [title, time, detail_content, detail_url])
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('Spider finished............')
        self.cursor.close()
        self.conn.close()
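The insert above assumes a mcdonalds table already exists. A hypothetical one-off setup script with a schema inferred from the four columns used (the column types are assumptions, not from the original project):

import pymysql

# Hypothetical setup script; credentials stay elided, types are guesses
conn = pymysql.connect(host='***', user='***', password='***', database='***', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute('''
        create table if not exists mcdonalds (
            id int auto_increment primary key,
            title varchar(255),
            issueTime varchar(64),
            content text,
            url varchar(512)
        ) default charset=utf8mb4;
    ''')
conn.commit()
conn.close()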
In settings.py, DEPTH_PRIORITY = 1 is set together with SCHEDULER_DISK_QUEUE and SCHEDULER_MEMORY_QUEUE pointing at the FIFO queue classes, which switches the scheduler to first-in-first-out, i.e. breadth-first crawling. In addition, CONCURRENT_REQUESTS = 2 caps the spider at two concurrent requests, which (together with DOWNLOAD_DELAY) slows the crawl down enough to watch the order in the log.
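With these settings in place, a quick way to confirm the order is to run the spider the usual way from the project root:

scrapy crawl mcdonalds

Under breadth-first scheduling, all the list pages (depth 0) should be dequeued before any detail page (depth 1), so the "page X --- item Y" lines printed by parse_detail should appear grouped page by page rather than interleaved as in the depth-first demo.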