First, start the Redis database in a terminal!
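If Redis is not running as an always-on service, a typical way to start it and verify it is reachable (assuming a default local install) is:

redis-server      # starts the Redis server; leave this window open
redis-cli ping    # in another terminal; a reply of PONG means Redis is up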
Method 1 (standalone crawler). Creation command: scrapy genspider -t crawl bizhi netbian.com
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BizhiSpider(CrawlSpider):
    name = 'bizhi'
    allowed_domains = ['netbian.com']
    start_urls = ['http://pic.netbian.com/4kfengjing/']
    # allow is a regex matched against candidate URLs; this pattern picks up
    # the pagination links (index_2.html, index_3.html, ...)
    page_link = LinkExtractor(allow=r'index_\d+\.html')
    rules = (
        Rule(page_link, callback='get_detail', follow=True),
    )

    def get_detail(self, response):
        print('--------------------')
        li_list = response.xpath('//div[@class="slist"]/ul/li')
        # print(li_list)
        for li in li_list:
            img = li.xpath('.//a/img/@src').extract_first('')
            print(img)
            title = li.xpath('.//a/b/text()').extract_first('')
            print(title)
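The spider above only prints what it finds. For the pipelines configured below to actually receive data, the callback has to yield items. A minimal sketch (the field names img and title are my choice, and the urljoin call assumes the listing page uses relative src paths):

    def get_detail(self, response):
        for li in response.xpath('//div[@class="slist"]/ul/li'):
            # src on the listing page is usually relative (/uploads/...);
            # response.urljoin turns it into an absolute URL
            img = response.urljoin(li.xpath('.//a/img/@src').extract_first(''))
            title = li.xpath('.//a/b/text()').extract_first('')
            # a plain dict is a valid item; RedisPipeline stores it in the
            # Redis list "bizhi:items" by default
            yield {'img': img, 'title': title}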
Configure the following in settings.py:
ITEM_PIPELINES = {
    'bizhispider.pipelines.BizhispiderPipeline': 300,
    # scrapy_redis pipeline: serializes each scraped item into Redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Add these two lines:
# deduplicate requests with scrapy_redis's Redis-backed fingerprint filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy_redis scheduler instead of Scrapy's default scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
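scrapy_redis connects to Redis on localhost:6379 unless told otherwise. For a real multi-machine crawl, every node must point at the same Redis server, so it is worth adding the connection settings explicitly. These are standard scrapy_redis settings; the host below is a placeholder for your own server:

# Redis connection shared by all crawler nodes (defaults: localhost, 6379)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# keep the request queue and dedup fingerprints in Redis after the spider
# closes, so a later run (or another node) can pick up where it left off
SCHEDULER_PERSIST = True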
Run in the terminal: scrapy crawl bizhi
Method 2 (distributed). Creation command: scrapy genspider fengjing netbian.com
import scrapy
# import the Redis-backed crawl spider
from scrapy_redis.spiders import RedisCrawlSpider
# alternative base class, pick one of the two
# (note: Rule lives in scrapy.spiders, not in scrapy_redis.spiders)
# from scrapy_redis.spiders import RedisSpider


# inherit from RedisCrawlSpider instead of CrawlSpider
class FengjingSpider(RedisCrawlSpider):
    name = 'fengjing'
    allowed_domains = ['netbian.com']
    # no start_urls here: start URLs are read from the Redis list named below
    # start_urls = ['http://pic.netbian.com/4kfengjing/index_3.html']
    redis_key = 'fengjingspider:start_urls'
    # rules (left disabled here; see the sketch after this snippet for a
    # working version)
    # rules = (
    #     Rule(page_link, callback='parse', follow=True),
    # )

    def parse(self, response):
        print('--------------------')
        li_list = response.xpath('//div[@class="slist"]/ul/li')
        # print(li_list)
        for li in li_list:
            img = li.xpath('.//a/img/@src').extract_first('')
            print(img)
            title = li.xpath('.//a/b/text()').extract_first('')
            print(title)
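If you also want the distributed spider to follow pagination automatically like the standalone one, the rules can be restored. This is my reconstruction, not from the original; note Scrapy's documented warning that a rule callback must not be named parse, because CrawlSpider uses parse internally to dispatch the rules:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class FengjingSpider(RedisCrawlSpider):
    name = 'fengjing'
    allowed_domains = ['netbian.com']
    redis_key = 'fengjingspider:start_urls'
    rules = (
        # follow pagination links, same pattern as the standalone version
        Rule(LinkExtractor(allow=r'index_\d+\.html'),
             callback='get_detail', follow=True),
    )

    def get_detail(self, response):
        # same extraction as parse() above
        for li in response.xpath('//div[@class="slist"]/ul/li'):
            print(li.xpath('.//a/img/@src').extract_first(''))
            print(li.xpath('.//a/b/text()').extract_first(''))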
Configure settings.py the same way as above.
Run in the terminal: scrapy crawl fengjing (the spider name defined above). The spider starts, finds no start URL in Redis, and pauses to wait for one. Once it pauses, open a new cmd terminal and enter: redis-cli
Then enter: lpush <redis_key value> <URL to request>, and the crawler will resume running.
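For the spider above, whose redis_key is fengjingspider:start_urls, that would be for example:

lpush fengjingspider:start_urls http://pic.netbian.com/4kfengjing/

Every URL pushed onto this list is consumed by whichever idle crawler node grabs it first, which is what makes the crawl distributed.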