# JD book spider (scrapy + scrapy_redis): category page -> listing pages -> price API
import scrapy,json
from urllib.parse import urljoin
from copy import deepcopy
class JdSpider(scrapy.Spider):
    """Crawl JD book categories, then each category's listing pages,
    then fetch each book's price from the p.3.cn price API.

    Flow: parse (category tree) -> list_parse (book listings, paginated)
    -> book_price (attach price, emit item).
    """

    name = 'jd'
    # 3.cn is needed for the price API requests.
    allowed_domains = ['jd.com', '3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        """Parse the category page: each dt is a large category, the
        following dd holds its sub-categories (one per em).
        Yields one Request per sub-category listing page."""
        dt_list = response.xpath('//div[@class="mc"]/dl/dt')
        for dt in dt_list:
            item = {}
            item["large_title"] = dt.xpath('./a/text()').extract_first()
            item["large_title_href"] = dt.xpath('./a/@href').extract_first()
            em_list = dt.xpath('./following-sibling::dd[1]/em')
            for em in em_list:
                item["small_title"] = em.xpath('./a/text()').extract_first()
                small_href = em.xpath('./a/@href').extract_first()
                # Skip entries without both a title and a usable link
                # (urljoin would raise TypeError on a None href).
                if item["small_title"] is None or small_href is None:
                    continue
                item["small_title_href"] = urljoin(response.url, small_href)
                yield scrapy.Request(
                    item["small_title_href"],
                    callback=self.list_parse,
                    # deepcopy: `item` is mutated on every loop iteration,
                    # so each request needs its own snapshot.
                    meta={"item": deepcopy(item)},
                )

    def list_parse(self, response):
        """Parse a listing page: one item per book li, request its price,
        then follow pagination."""
        item = response.meta["item"]
        li_list = response.xpath('//div[@id="plist"]/ul/li')
        for li in li_list:
            item["img"] = li.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()
            item["name"] = li.xpath('.//div[@class="p-name"]/a/@title').extract_first()
            item["book_href"] = urljoin(response.url, li.xpath('.//div[@class="p-img"]/a/@href').extract_first())
            item["author"] = li.xpath('.//div[@class="p-bookdetails"]//span[@class="author_type_1"]/a/text()').extract()
            item["press"] = li.xpath('.//div[@class="p-bookdetails"]//span[@class="p-bi-store"]/a/@title').extract_first()
            # FIX: the date node may be absent; calling .strip() on None
            # raised AttributeError in the original.
            date = li.xpath('.//div[@class="p-bookdetails"]//span[@class="p-bi-date"]/text()').extract_first()
            item["date"] = date.strip() if date else None
            item["sku"] = li.xpath('.//div[@class="gl-i-wrap j-sku-item"]/@data-sku').extract_first()
            # Only query the price API when we actually have a sku.
            if item["sku"]:
                yield scrapy.Request(
                    'https://p.3.cn/prices/mgets?&skuIds=J_{}'.format(item["sku"]),
                    callback=self.book_price,
                    meta={'item': deepcopy(item)},
                )
        # FIX: check the raw href before urljoin. The original joined first,
        # so on the last page (href is None) urljoin raised TypeError and the
        # `is not None` guard could never fire.
        next_href = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(
                urljoin(response.url, next_href),
                callback=self.list_parse,
                meta={"item": deepcopy(item)},
            )

    def book_price(self, response):
        """Attach the price from the price-API JSON and emit the item.

        The API returns a JSON list; "op" is the price field of the
        first entry.
        """
        item = response.meta["item"]
        item["price"] = json.loads(response.text)[0]["op"]
        print(item)
        # FIX: yield the finished item so item pipelines receive it;
        # the original only printed it and nothing was ever stored.
        yield item
# ---- settings.py (scrapy_redis configuration) ----
# Scrapy project identity and module discovery.
BOT_NAME = 'book2'
SPIDER_MODULES = ['book2.spiders']
NEWSPIDER_MODULE = 'book2.spiders'
# scrapy_redis: Redis-backed duplicate filter and scheduler, so the
# request queue / seen-set are shared and survive restarts.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the Redis queue and dupefilter after the spider closes (resumable crawl).
SCHEDULER_PERSIST = True
# Local Redis instance on the default port.
REDIS_URL = "redis://127.0.0.1:6379"
# Suppress INFO-level noise; only warnings and errors are logged.
LOG_LEVEL = "WARNING"
# Desktop Chrome UA so JD serves the normal HTML pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
# NOTE(review): robots.txt is deliberately ignored here.
ROBOTSTXT_OBEY = False