# JD book spider (scrapy + scrapy_redis): category page -> listing pages -> price API
import scrapy,json
from urllib.parse import urljoin
from copy import deepcopy
class JdSpider(scrapy.Spider):
    """Crawl JD book categories, then each category's listing pages,
    then fetch each book's price from the p.3.cn price API.

    Flow: parse (category tree) -> list_parse (book listings, paginated)
    -> book_price (attach price, emit item).
    """

    name = 'jd'
    # 3.cn is needed for the price API requests.
    allowed_domains = ['jd.com', '3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        """Parse the category page: each dt is a large category, the
        following dd holds its sub-categories (one per em).
        Yields one Request per sub-category listing page."""
        dt_list = response.xpath('//div[@class="mc"]/dl/dt')
        for dt in dt_list:
            item = {}
            item["large_title"] = dt.xpath('./a/text()').extract_first()
            item["large_title_href"] = dt.xpath('./a/@href').extract_first()
            em_list = dt.xpath('./following-sibling::dd[1]/em')
            for em in em_list:
                item["small_title"] = em.xpath('./a/text()').extract_first()
                small_href = em.xpath('./a/@href').extract_first()
                # Skip entries without both a title and a usable link
                # (urljoin would raise TypeError on a None href).
                if item["small_title"] is None or small_href is None:
                    continue
                item["small_title_href"] = urljoin(response.url, small_href)
                yield scrapy.Request(
                    item["small_title_href"],
                    callback=self.list_parse,
                    # deepcopy: `item` is mutated on every loop iteration,
                    # so each request needs its own snapshot.
                    meta={"item": deepcopy(item)},
                )

    def list_parse(self, response):
        """Parse a listing page: one item per book li, request its price,
        then follow pagination."""
        item = response.meta["item"]
        li_list = response.xpath('//div[@id="plist"]/ul/li')
        for li in li_list:
            item["img"] = li.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
            item["name"] = li.xpath('.//div[@class="p-name"]/a/@title').extract_first()
            item["book_href"] = urljoin(response.url, li.xpath('.//div[@class="p-img"]/a/@href').extract_first())
            item["author"] = li.xpath('.//div[@class="p-bookdetails"]//span[@class="author_type_1"]/a/text()').extract()
            item["press"] = li.xpath('.//div[@class="p-bookdetails"]//span[@class="p-bi-store"]/a/@title').extract_first()
            # FIX: the date node may be absent; calling .strip() on None
            # raised AttributeError in the original.
            date = li.xpath('.//div[@class="p-bookdetails"]//span[@class="p-bi-date"]/text()').extract_first()
            item["date"] = date.strip() if date else None
            item["sku"] = li.xpath('.//div[@class="gl-i-wrap j-sku-item"]/@data-sku').extract_first()
            # Only query the price API when we actually have a sku.
            if item["sku"]:
                yield scrapy.Request(
                    'https://p.3.cn/prices/mgets?&skuIds=J_{}'.format(item["sku"]),
                    callback=self.book_price,
                    meta={'item': deepcopy(item)},
                )
        # FIX: check the raw href before urljoin. The original joined first,
        # so on the last page (href is None) urljoin raised TypeError and the
        # `is not None` guard could never fire.
        next_href = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(
                urljoin(response.url, next_href),
                callback=self.list_parse,
                meta={"item": deepcopy(item)},
            )

    def book_price(self, response):
        """Attach the price from the price-API JSON and emit the item.

        The API returns a JSON list; "op" is the price field of the
        first entry.
        """
        item = response.meta["item"]
        item["price"] = json.loads(response.text)[0]["op"]
        print(item)
        # FIX: yield the finished item so item pipelines receive it;
        # the original only printed it and nothing was ever stored.
        yield item
# ---- settings.py (scrapy_redis configuration) ----
# Scrapy project identity and module discovery.
BOT_NAME = 'book2'
SPIDER_MODULES = ['book2.spiders']
NEWSPIDER_MODULE = 'book2.spiders'
# scrapy_redis: Redis-backed duplicate filter and scheduler, so the
# request queue / seen-set are shared and survive restarts.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the Redis queue and dupefilter after the spider closes (resumable crawl).
SCHEDULER_PERSIST = True
# Local Redis instance on the default port.
REDIS_URL = "redis://127.0.0.1:6379"
# Suppress INFO-level noise; only warnings and errors are logged.
LOG_LEVEL = "WARNING"
# Desktop Chrome UA so JD serves the normal HTML pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
# NOTE(review): robots.txt is deliberately ignored here.
ROBOTSTXT_OBEY = False