1. Machines
Standalone Scrapy: 1 machine
Distributed Scrapy (scrapy-redis): at least 2 machines
2. Linux environment setup
2.1 Install Redis
2.2 Install Python
Use Anaconda to manage the Python environment.
For details, see: Linux Anaconda installation and environment configuration
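Before going further, it is worth confirming that every worker machine can actually reach the Redis instance. Below is a minimal sketch using redis-py; the host and port mirror the settings in section 3.1 and are assumptions to adjust for your deployment:

# check_redis.py - quick Redis connectivity check
# Assumed host/port taken from the settings in section 3.1.
import redis

r = redis.Redis(host='172.16.1.101', port=6379)
try:
    r.ping()  # raises ConnectionError when Redis is unreachable
    print('Redis is reachable')
except redis.exceptions.ConnectionError as e:
    print('Cannot reach Redis:', e)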
3. Code changes
3.1 settings.py changes
Add the following to settings.py (the Scrapy project settings file):
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = False  # False: flush the Redis queue when the spider closes; True keeps it for the next run
SCHEDULER_FLUSH_ON_START = True  # True: flush the Redis queue when the spider starts
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO queue (alias of FifoQueue)
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.FifoQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"
# REDIS_URL = 'redis://172.16.1.101:6379'  # alternative to REDIS_HOST/REDIS_PORT below
REDIS_HOST = '172.16.1.101'
REDIS_PORT = 6379
HTTPERROR_ALLOWED_CODES = [403]  # pass 403 responses to the spider instead of dropping them
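With these settings, scrapy-redis keeps its state in per-spider Redis keys: by default the request queue under sample:requests, the seen-request fingerprints under sample:dupefilter, and the seed URLs under sample:start_urls (the spider name sample is an assumption matching section 4). A small sketch for inspecting them with redis-py:

# inspect_keys.py - peek at the spider's scrapy-redis state (spider name 'sample' assumed)
import redis

r = redis.Redis(host='172.16.1.101', port=6379)
print(r.keys('sample:*'))            # all keys belonging to the spider
print(r.llen('sample:start_urls'))   # seed URLs not yet consumed
print(r.scard('sample:dupefilter'))  # fingerprints of requests already seen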
3.2 Spider code
1. Change the spider's base class to RedisCrawlSpider:
# pip install scrapy-redis
from scrapy_redis.spiders import RedisCrawlSpider

class SampleSpider(RedisCrawlSpider):
    pass
2. Remove allowed_domains, start_urls, and any start_requests override; the start URLs come from Redis instead.
3. The entry callback for the first URL must be parse, or a callback defined by your own rules (a complete sketch follows after this list):
def parse(self, response):
    pass
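Putting the pieces together, here is a minimal working sketch; SampleSpider, the URL pattern, and the parsed fields are placeholder assumptions. Note that redis_key is declared explicitly: it matches the list seeded in section 4.4, and the idle-close extension in section 5.1 only activates for spiders that define it:

# spiders/sample.py - minimal RedisCrawlSpider sketch (all names are placeholders)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class SampleSpider(RedisCrawlSpider):
    name = 'sample'
    redis_key = 'sample:start_urls'  # the list that section 4.4 seeds with lpush

    # follow detail-page links and hand them to parse_item (placeholder pattern)
    rules = (
        Rule(LinkExtractor(allow=r'/detail/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
        }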
3.3 Extension: close the spider after a long idle period
Here the spider is closed once it has been idle for 1 hour.
1. Add to settings.py:
MYEXT_ENABLED = True  # enable the extension
IDLE_NUMBER = 720  # idle threshold in 5-second units; 720 units = 1 hour
# register the extension under EXTENSIONS to activate it
EXTENSIONS = {
    'project.extensions.RedisSpiderSmartIdleClosedExensions': 500,
}
2. extensions.py
See the extensions.py appendix (section 5.1).
4. Scheduled tasks
4.1 crontab on the Redis machine
The spider starts at 00:05 every day; the seed URL is pushed ten minutes later, at 00:15.
# start the spider and seed start_urls
# comic_redis
5 0 * * * sh /home/hilqiqi0/workspace/crawler_redis/run_scrapy.sh sample >> /home/hilqiqi0/workspace/crawler_redis/log.txt 2>&1
15 0 * * * sh /home/hilqiqi0/workspace/crawler_redis/run_scrapy_redis_start_url.sh
4.2 crontab on the other crawler machines
# start the other crawlers
# comic_redis
5 0 * * * sh /home/hilqiqi0/workspace/crawler_redis/run_scrapy.sh qq >> /home/hilqiqi0/workspace/crawler_redis/log.txt 2>&1
4.3 Spider launch script: run_scrapy.sh
#!/bin/bash
# The machines have different Python environments; if they were set up
# identically, this check would be unnecessary.
# -f tests whether the file exists
if [ -f "/data2/hilqiqi0/venv/bin/activate" ]; then
    echo /data2/hilqiqi0/venv/bin/activate
    source /data2/hilqiqi0/venv/bin/activate
else
    echo /etc/profile
    source /etc/profile
    # echo conda activate
    # conda activate
fi
cur_dateTime="`date +%Y-%m-%d,%H:%M:%S`"
echo $cur_dateTime
cd /home/hilqiqi0/workspace/crawler_redis
echo $1
echo log/log_$1.txt
scrapy crawl $1 > log/log_$1.txt 2>&1 &

/data2/hilqiqi0/venv/bin/activate is the activation script of the Python virtualenv.
4.4 Start-URL seeding script: run_scrapy_redis_start_url.sh
#!/bin/bash
source /etc/profile
redis-cli lpush sample:start_urls http://www.sample.com
Note that the URL must carry a scheme (http:// or https://); Scrapy rejects scheme-less URLs such as www.sample.com with "Missing scheme in request url".
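The same seeding can be done from Python, which is convenient when the seed URLs come from a database or a file. A minimal sketch with redis-py; the key and URL are the same placeholders as above:

# seed_start_urls.py - push seed URLs onto the spider's start list
# Assumes the Redis host/port from section 3.1 and the key from the spider's redis_key.
import redis

r = redis.Redis(host='172.16.1.101', port=6379)
r.lpush('sample:start_urls', 'http://www.sample.com')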
5. Appendix
5.1 extensions.py
# -*- coding: utf-8 -*-
# Extension that closes a Redis-based spider after a long idle period.
from scrapy import signals
from scrapy.exceptions import NotConfigured


class RedisSpiderSmartIdleClosedExensions(object):

    def __init__(self, idle_number, crawler):
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_list = []
        self.idle_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured
        # only spiders that define redis_key are supported
        if 'redis_key' not in crawler.spidercls.__dict__.keys():
            raise NotConfigured('Only supports RedisSpider')
        # read the idle threshold (in 5-second units) from the settings
        idle_number = crawler.settings.getint('IDLE_NUMBER', 360)
        # instantiate the extension object
        ext = cls(idle_number, crawler)
        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_opened(self, spider):
        spider.logger.info("opened spider {}, allowed idle time: {} seconds".format(
            spider.name, self.idle_number * 5))

    def spider_closed(self, spider):
        spider.logger.info("closed spider {}, idle time exceeded {} seconds".format(
            spider.name, self.idle_number * 5))

    def spider_idle(self, spider):
        # spider_idle fires once when the spider starts and then every 5 seconds
        # while the spider stays idle; once redis_key has been absent for
        # idle_number consecutive checks, close the spider.
        if not spider.server.exists(spider.redis_key):
            self.idle_count += 1
        else:
            self.idle_count = 0
        if self.idle_count > self.idle_number:
            # close the spider, recording the reason
            self.crawler.engine.close_spider(spider, 'Waiting time exceeded')