Once the first Rule's href regex matches a category link, the crawler follows it; on the resulting category page, the second Rule's href regex then matches the pagination links and follows them to fetch each subsequent page of data.
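To see how the two allow patterns divide the work, here is a standalone check with Python's re module (the sample URLs are made-up illustrations of Dangdang's URL scheme; LinkExtractor applies allow patterns with search semantics):

import re

# Pattern from the first Rule: a category landing page.
category = re.compile(r'http://category.dangdang.com/cp([0-9]|\.)+\.html')
# Pattern from the second Rule: a paginated listing within a category.
page = re.compile(r'/pg[0-9]+-cp([0-9]|\.)+\.html')

print(bool(category.search('http://category.dangdang.com/cp01.03.00.00.00.00.html')))  # True
print(bool(page.search('/pg2-cp01.03.00.00.00.00.html')))                              # True
print(bool(page.search('http://category.dangdang.com/cp01.03.00.00.00.00.html')))      # False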
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from dangdang1.items import Dangdang1Item


class BookSpider(CrawlSpider):
    name = 'book'
    allowed_domains = ['book.dangdang.com', 'category.dangdang.com']
    start_urls = ['http://book.dangdang.com/']
    count = 0

    rules = (
        # First Rule: follow links into each category page; no callback needed.
        Rule(LinkExtractor(allow=r'http://category.dangdang.com/cp([0-9]|\.)+\.html'),
             follow=True),
        # Second Rule: on a category page, follow the pagination links and
        # hand every listing page to get_book_info.
        Rule(LinkExtractor(allow=r'/pg[0-9]+-cp([0-9]|\.)+\.html'),
             callback='get_book_info', follow=True, process_links='process_links'),
    )
    def process_links(self, links):
        # The pagination hrefs are relative, so prepend the scheme and domain
        # before the request is scheduled.
        for link in links:
            if 'http' not in link.url:
                link.url = 'http://category.dangdang.com' + link.url
            yield link

    def parse_start_url(self, response):
        # CrawlSpider uses parse() internally to drive the rules, so it must
        # not be overridden; the start URL response is handled here instead.
        print(response.url)
        return []
    def get_book_info(self, response):
        books = response.xpath('//ul[@id="component_59"]/li')
        for book in books:
            name = book.xpath('.//img/@alt').extract_first()
            price = book.xpath(
                './p[@class="price"]/span[@class="search_now_price"]/text()').extract_first()
            # Lazy-loaded covers keep the real URL in data-original; only the
            # first image on the page has it in src, so fall back to that.
            src = book.xpath('.//img/@data-original').extract_first()
            if not src:
                src = book.xpath('.//img/@src').extract_first()
            print(name)
            yield Dangdang1Item(name=name, price=price, src=src)
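The scraped items are serialized by a pipeline in pipelines.py: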
import json

from itemadapter import ItemAdapter


class Dangdang1Pipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write one JSON object per line; str(item) would not be valid JSON.
        self.fp.write(json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
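The fields used by the spider are declared in items.py: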
import scrapy


class Dangdang1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    src = scrapy.Field()
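For the pipeline to receive any items it has to be enabled in settings.py (a minimal sketch, assuming the default layout generated by scrapy startproject dangdang1; the priority value 300 is an arbitrary choice):

# settings.py
ITEM_PIPELINES = {
    'dangdang1.pipelines.Dangdang1Pipeline': 300,
}
# If the site's robots.txt blocks the crawl, setting ROBOTSTXT_OBEY = False
# may also be needed.

The spider is then started with scrapy crawl book.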