PythonCrawler: scraping site-wide data with Scrapy

Once the first Rule's href regex matches a category link, the spider follows it; on the category pages, the second Rule's href regex then takes over, matching the next-page links, following them, and handing each listing page to the callback. The spider:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dangdang1.items import Dangdang1Item


class BookSpider(CrawlSpider):
    name = 'book'
    allowed_domains = ['book.dangdang.com', 'category.dangdang.com']
    start_urls = ['http://book.dangdang.com/']

    rules = (
        # First Rule: follow links into every category page (no callback,
        # these pages are only crawled for their links).
        Rule(LinkExtractor(allow=r'http://category.dangdang.com/cp([0-9]|\.)+\.html'),
             follow=True),
        # Second Rule: on a category page, follow the pagination links and
        # parse each listing page with get_book_info.
        Rule(LinkExtractor(allow=r'/pg[0-9]+-cp([0-9]|\.)+\.html'),
             callback='get_book_info', follow=True, process_links='process_links'),
    )

    def process_links(self, links):
        # Pagination hrefs can be relative; prepend the scheme and host so
        # Scrapy receives an absolute URL.
        for link in links:
            if 'http' not in link.url:
                link.url = 'http://category.dangdang.com' + link.url
            yield link

    def parse_start_url(self, response):
        # A CrawlSpider must not override parse(): CrawlSpider uses parse()
        # internally to apply the rules. parse_start_url is the hook for
        # responses to the start URLs.
        self.logger.info(response.url)

    def get_book_info(self, response):
        books = response.xpath('//ul[@id="component_59"]/li')

        for book in books:
            name = book.xpath('.//img/@alt').extract_first()
            price = book.xpath(
                './p[@class="price"]/span[@class="search_now_price"]/text()').extract_first()
            # Covers below the first screen are lazy-loaded: the real URL is
            # in data-original, with src as the fallback.
            src = book.xpath('.//img/@data-original').extract_first()
            if not src:
                src = book.xpath('.//img/@src').extract_first()
            yield Dangdang1Item(name=name, price=price, src=src)
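
The two allow patterns can be sanity-checked outside Scrapy. A minimal sketch; the sample URLs are hypothetical, inferred from the patterns rather than taken from the live site:

import re

category = re.compile(r'http://category.dangdang.com/cp([0-9]|\.)+\.html')
pagination = re.compile(r'/pg[0-9]+-cp([0-9]|\.)+\.html')

# A category landing page: matched by the first Rule, followed but not parsed.
print(bool(category.search('http://category.dangdang.com/cp01.03.00.00.00.00.html')))  # True
# Page 2 of that category: matched by the second Rule and sent to get_book_info.
print(bool(pagination.search('http://category.dangdang.com/pg2-cp01.03.00.00.00.00.html')))  # True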

The item pipeline (pipelines.py) serializes each item to book.json, one JSON object per line:

import json

from itemadapter import ItemAdapter


class Dangdang1Pipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write one JSON object per line (JSON Lines) so the output
        # is actually parseable as JSON.
        line = json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False)
        self.fp.write(line + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
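
The pipeline only runs once it is enabled in settings.py. A minimal sketch, assuming the default project module dangdang1 and the conventional priority of 300:

# settings.py
ITEM_PIPELINES = {
    'dangdang1.pipelines.Dangdang1Pipeline': 300,
}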
The item definition (items.py) declares the three fields the spider yields:

import scrapy


class Dangdang1Item(scrapy.Item):
    name = scrapy.Field()   # book title (taken from the img alt text)
    price = scrapy.Field()  # current price
    src = scrapy.Field()    # cover image URL
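
With the spider, pipeline, and item in place, start the crawl from the project root with scrapy crawl book; every listing page matched by the second Rule goes through get_book_info, and each yielded item lands in book.json.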
