# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urljoin
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider


class DangdangSpider(RedisSpider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    redis_key = "dangdang"  # start URLs are popped from this Redis list

    def parse(self, response):
        div_list = response.xpath('//div[@class="con flq_body"]/div')
        for div in div_list:
            item = {}
            item["large_title"] = div.xpath('./dl/dt//text()').extract()
            item["large_title"] = [i.strip() for i in item["large_title"] if len(i.strip()) > 0]
            dl_list = div.xpath('.//dl[@class="inner_dl"]')
            for dl in dl_list:
                item["middle_title"] = dl.xpath('./dt//text()').extract()
                item["middle_title"] = [i.strip() for i in item["middle_title"] if len(i.strip()) > 0]
                a_list = dl.xpath('./dd/a')
                for a in a_list:
                    # extract relative to the current <a>, not the whole <dl>,
                    # otherwise every iteration returns the first link again
                    item["small_title"] = a.xpath('./text()').extract_first()
                    item["small_href"] = a.xpath('./@href').extract_first()
                    if item["small_href"] is not None:
                        yield scrapy.Request(
                            item["small_href"],
                            # deepcopy so concurrent callbacks don't mutate one shared dict
                            meta={"item": deepcopy(item)},
                            callback=self.detail_parse
                        )
    def detail_parse(self, response):
        item = response.meta["item"]
        # keep the selector list; extract_first() here would return a string
        # instead of the <li> selectors we need to iterate over
        li_list = response.xpath('//ul[@class="bigimg"]/li')
        for li in li_list:
            item["img"] = li.xpath('./a/img/@data-original').extract_first()
            item["name"] = li.xpath('./p[@class="name"]/a/text()').extract_first()
            item["price"] = li.xpath('./p[@class="price"]/span/text()').extract_first()
            item["detail"] = li.xpath('./p[@class="detail"]/text()').extract_first()
            item["author"] = li.xpath('./p[@class="search_book_author"]/span[1]//text()').extract()
            item["date"] = li.xpath('./p[@class="search_book_author"]/span[2]/text()').extract_first()
            item["press"] = li.xpath('./p[@class="search_book_author"]/span[3]/a/text()').extract_first()
            yield item
        # "下一页" means "next page"; the literal must be quoted inside the XPath,
        # and the href extracted before it can be joined with the page URL
        next_href = response.xpath('//li[@class="next"]/a[text()="下一页"]/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(
                urljoin(response.url, next_href),
                meta={"item": deepcopy(item)},
                callback=self.detail_parse
            )
Added to settings.py:
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
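Because the spider subclasses RedisSpider with redis_key = "dangdang", it starts idle and waits for a URL to appear in that Redis list. A minimal seeding sketch using redis-py (the connection details follow the REDIS_URL above; the lpush key must match redis_key):

# seed_start_url.py - push the first URL into the "dangdang" list (hypothetical helper)
import redis

r = redis.Redis(host="127.0.0.1", port=6379)
r.lpush("dangdang", "http://book.dangdang.com/")

With SCHEDULER_PERSIST = True, the request queue and the RFPDupeFilter fingerprints stay in Redis after a crawl stops, so several spider processes can share the same queue and avoid re-fetching pages another worker already handled.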