scrapy_redis configuration
1. Configure the spider file
Comment out start_urls:
# start_urls = ['https://baidu.com']
Add a redis_key (the name of the Redis list the spider will read its start URLs from):
redis_key = 'taoche:start_urls'
Make the spider inherit from RedisSpider instead of scrapy.Spider:
from scrapy_redis.spiders import RedisSpider
class TcSpider(RedisSpider):
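Putting the three changes together, a minimal spider could look like the sketch below; the name, allowed_domains, and parse body are illustrative assumptions, not taken from the original project:

from scrapy_redis.spiders import RedisSpider

class TcSpider(RedisSpider):
    name = 'tc'                          # assumed spider name
    allowed_domains = ['taoche.com']
    # start_urls is gone; seeds come from this Redis list instead
    redis_key = 'taoche:start_urls'

    def parse(self, response):
        # placeholder callback: extract items / follow links as usual
        pass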
Finally, a separate start_urls script is used to push the seed links into Redis:
import redis
from taoche import settings

url = 'https://%s.taoche.com%s'
redis_client = redis.Redis('localhost')
# build one seed URL per (city, vehicle-type) pair and push it
# onto the list the spider is watching
for city in settings.CITY_CODE:
    for cx in settings.MORE:
        start_url = url % (city, cx)
        redis_client.lpush('taoche:start_urls', start_url)
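After running this script you can sanity-check that the seeds actually landed in Redis (a quick check, assuming the same local instance):

import redis

redis_client = redis.Redis('localhost')
# how many seed URLs are waiting in the list
print(redis_client.llen('taoche:start_urls'))
# peek at the first entry without removing it
print(redis_client.lrange('taoche:start_urls', 0, 0))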
2. Configure settings
Master settings (the machine that runs Redis):
# Use the scrapy-redis scheduler instead of Scrapy's default
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplicate request URLs through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Store scraped items in Redis via the scrapy-redis pipeline
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# Scheduling queue: a priority queue kept in Redis
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Redis host (the master runs Redis locally)
REDIS_HOST = 'localhost'
# Redis port
REDIS_PORT = 6379
# Alternatively, a single connection URL:
# REDIS_URL = "redis://[user]:password@localhost:port"
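PriorityQueue is only one of the queue implementations scrapy_redis.queue provides; FifoQueue and LifoQueue are drop-in alternatives if you want breadth-first or depth-first ordering:

# default: pop requests by priority
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# strict first-in-first-out (roughly breadth-first)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# last-in-first-out (roughly depth-first)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'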
Slave (worker) settings:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplicate request URLs through Redis (same as the master)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the Redis request queue and dupefilter after the crawl finishes (do not clear them)
SCHEDULER_PERSIST = True
# RedisPipeline can stay disabled on slaves if items are stored elsewhere
# ITEM_PIPELINES = {
#     'scrapy_redis.pipelines.RedisPipeline': 300,
# }
# Point the slaves at the master's Redis instance
REDIS_HOST = '10.10.86.254'
REDIS_PORT = 6379
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
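With both sides configured, a typical run order is: start Redis on the master, launch the spider on every slave (each one blocks waiting on redis_key), then push the seed URLs with the script above. Launching programmatically might look like this, assuming the spider's name attribute is 'tc':

from scrapy import cmdline

# blocks until URLs appear in taoche:start_urls, then starts crawling;
# run the same command on each slave machine
cmdline.execute('scrapy crawl tc'.split())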