Basic Spider example:
class BookSpider(scrapy.Spider):
    """Minimal Spider template: Scrapy requests each URL in ``start_urls``
    and passes the response to :meth:`parse`."""

    name = 'book'                              # unique spider name used by `scrapy crawl book`
    allowed_domains = ['suning.com']           # off-site requests are filtered out
    start_urls = ['https://book.suning.com/']  # initial request(s)

    def parse(self, response):
        # Default callback for responses to start_urls; extraction logic goes here.
        pass
CrawlSpider example (rule-based link following):
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class KaoyanSpider(CrawlSpider):
    """CrawlSpider template: automatically extracts and follows links that
    match its ``rules``, dispatching matched pages to the named callbacks."""

    name = 'kaoyan'
    allowed_domains = ['kaoyan365.cn']
    start_urls = ['http://www.kaoyan365.cn/kaoyantiaoji/tiaojixinxi/158281.html']

    rules = (
        # Extract list-page URLs; follow=True keeps crawling links found on them.
        # NOTE(review): this pattern (position.php pagination) looks copied from
        # a different site's tutorial — confirm it actually matches kaoyan365.cn URLs.
        Rule(LinkExtractor(allow=r'position\.php\?&start=\d*?#a'),
             callback='parse_list', follow=True),
        # Extract detail-page URLs
        # Rule(LinkExtractor(allow=r'position_detail\.php\?id=\d*?&keywords=&tid=0&lid=0'), callback='parse_item')
    )

    def parse_list(self, response):
        # Callback for list pages matched by the first Rule.
        pass
RedisSpider example (distributed crawling via scrapy-redis):
class BookSpider(RedisSpider):  # inherits from RedisSpider instead of scrapy.Spider
    """Distributed spider template: instead of ``start_urls``, seed URLs are
    read from the Redis key named by ``redis_key``, so multiple spider
    processes can share one crawl queue."""

    name = 'dang'
    allowed_domains = ['dangdang.com']        # must be set manually (no start_urls to infer from)
    # start_urls = ['http://dangdang.com/']   # intentionally absent — seeds come from Redis
    redis_key = "dangdang"                    # Redis list the spider pops start URLs from

    def parse(self, response):
        # Callback for responses to URLs pulled from Redis; extraction logic goes here.
        pass