效果:
![](https://img-blog.csdnimg.cn/img_convert/ae2c9673b2e225ff62140c288e211275.png)
![](https://img-blog.csdnimg.cn/img_convert/a91c9efbb04b7f6426a59026b4b90648.png)
![](https://img-blog.csdnimg.cn/img_convert/14555207e6ee343c06ab2ca7faea6afe.png)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
version_0
分析:
返回的数据是Json
大分类:https://lapi.jd.com/pool?body={%22pid%22:%220101947%22}&source=book,每个poolid对应一个大title
```
title: "文学综合馆",
operate_title: "",
sort_num: 2,
fid: 0,
id: 7369,
ext_info: {
poolId: "0101945"
}
```
部分数据,即title对应一个poolId,属于同一个字典下
拿到大分类的poolId后,就可以用poolId去构造请求:https://lapi.jd.com/pool?body={%22pid%22:%220101945%22}&source=book ,从而获取中级title、小title,以及每个小title对应的url
请求小title的url即可拿到图书列表。图书信息位于类似 https://list.jd.com/listNew.php?cat=1713%2C3260%2C3345&page=150 的地址中,该页面里同时包含总页数,例如:SEARCH.adv_param={page:"150",page_count:"200",psort:"0",cid1: 0,
其中page_count为总页数,page为当前页数;价格没有单独的请求,已包含在该页面中。至此,网页分析完毕。
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
源代码
# -*- coding: utf-8 -*-
import scrapy
import json
from copy import deepcopy
from pprint import pprint
import re
from urllib import parse
class JdtsSpider(scrapy.Spider):
    """Spider that walks the JD book channel down to individual book pages.

    Crawl order, as implemented by the callbacks below:

    1. ``parse``         - top-level categories from the start URL (JSON).
    2. ``parse_poolid``  - middle and sub categories for one ``poolId`` (JSON).
    3. ``parse_s_href``  - one listing page of a sub category (HTML), plus
       requests for the remaining listing pages.
    4. ``parse_detail``  - one book page; completes and emits the item.

    The item is a plain ``dict`` carried between callbacks via
    ``Request.meta["item"]``.
    """

    name = 'jdts'
    allowed_domains = ["jd.com"]
    start_urls = ['https://lapi.jd.com/pool?body={%22pid%22:%220101947%22}&source=book']

    def parse(self, response):
        """Read the top-level categories and request each one's pool data.

        Yields:
            scrapy.Request: a request handled by ``parse_poolid`` whose
            ``meta["item"]`` holds ``b_title`` and ``poolId``.
        """
        json_obj = json.loads(response.body.decode())
        # The first entry and the last four entries are skipped; presumably
        # they are not book categories -- TODO confirm against the live API.
        data_list = json_obj["data"][1:-4]
        for data in data_list:
            # A fresh dict per category so requests never share state.
            item = {
                "b_title": data["title"],
                "poolId": data["ext_info"]["poolId"],
            }
            # Use the poolId to request the middle and sub categories.
            temporary_var = '"pid":"{}"'.format(item["poolId"])
            next_url = 'https://lapi.jd.com/pool?body={' + temporary_var + '}&source=book'
            yield scrapy.Request(
                url=next_url,
                callback=self.parse_poolid,
                meta={"item": item},
            )
            # NOTE(review): this stops after the first top-level category.
            # It looks like a crawl-size limiter; remove it to crawl all
            # categories.
            break

    def parse_poolid(self, response):
        """Read middle/sub categories and request each sub-category listing.

        Yields:
            scrapy.Request: one request per sub category, handled by
            ``parse_s_href``. Each request carries its own copy of the item
            extended with ``m_title``, ``s_title`` and ``s_href``.
        """
        item = response.meta["item"]
        json_obj = json.loads(response.body.decode())
        data_list = json_obj["data"][2:]
        # Middle-level categories.
        for data in data_list:
            item["m_title"] = data["title"]
            # Sub categories.
            for s_item in data["children"]:
                item["s_title"] = s_item["title"]
                item["s_href"] = s_item["link_value"]
                yield scrapy.Request(
                    url=item["s_href"],
                    callback=self.parse_s_href,
                    # Snapshot the item: ``item`` keeps being mutated by this
                    # loop, and the callback runs only after the response
                    # arrives, so sharing one dict would leave every request
                    # with the last sub category's values.
                    meta={"item": deepcopy(item)},
                )

    def parse_s_href(self, response):
        """Parse one listing page and schedule book pages and further pages.

        Yields:
            scrapy.Request: one request per book (handled by
            ``parse_detail``) followed by one request per listing page
            ``2..page_count`` (handled by this method again).
        """
        item = deepcopy(response.meta["item"])
        html = response.body.decode()  # decode once, reuse for every regex
        print("-"*20)
        print("响应的url:", response.url)

        # Total number of pages; fall back to a single page when the marker
        # is missing or not numeric instead of aborting the whole callback.
        page_counts = re.findall(r'page_count:\"(.*?)\"', html, re.S)
        try:
            count_page = int(page_counts[0])
        except (IndexError, ValueError):
            count_page = 1
        print("count_page:", count_page)

        # Book entries on this listing page.
        content_list = re.findall(r'class="p-img">(.*?)<div class="p-commit">', html, re.S)
        for content in content_list:
            try:
                book_title = re.findall(r'<em>(.*?)</em>', content, re.S)[1]
                raw_href = re.findall(r'href="(.*?)"', content, re.S)[0]
                book_price = re.findall(r'<i>(.*?)</i>', content, re.S)[0]
            except IndexError:
                # Entry lacks the expected markup; skip it rather than lose
                # the remaining books and the pagination of this page.
                continue
            item["book_info"] = {
                "book_title": book_title,
                "book_href": parse.urljoin(response.url, raw_href),
                "book_price": book_price,
            }
            yield scrapy.Request(
                url=item["book_info"]["book_href"],
                callback=self.parse_detail,
                meta={"item": deepcopy(item)},
            )

        # Pages run from 1 to page_count inclusive, so the last page must be
        # requested too. Every listing page re-issues the same set of URLs;
        # this relies on Scrapy's default duplicate filtering to drop repeats.
        if "&page=" in response.url:
            base_url = response.url.split("&page=", 1)[0]
        else:
            base_url = response.url
        for num in range(2, count_page + 1):
            yield scrapy.Request(
                url=base_url + "&page={}".format(num),
                callback=self.parse_s_href,
                meta={"item": response.meta["item"]},
            )

    def parse_detail(self, response):
        """Complete the item from a book page and emit it.

        Adds ``book_author`` and ``book_description`` (the URL from which the
        description can be fetched, not the description text) to
        ``item["book_info"]``.

        Yields:
            dict: the finished item.
        """
        item = response.meta["item"]
        item["book_info"]["book_author"] = response.xpath("//div[@class='p-author']/a/text()").extract_first()
        # Two description endpoints are used:
        #   https://dx.3.cn/desc/<skuid>
        #   https://cd.jd.com/description/channel?skuId=<skuid>&mainSkuId=<mainskuid>
        skuid = re.findall(r'com/(.*?)\.html', response.url, re.S)[0]
        mainskuid = re.findall(r"mainSkuId:'(.*?)'", response.body.decode(), re.S)
        if mainskuid:
            url = "https://cd.jd.com/description/channel?skuId={}&mainSkuId={}"
            next_url = url.format(skuid, mainskuid[0])
        else:
            url = "https://dx.3.cn/desc/{}"
            next_url = url.format(skuid)
        item["book_info"]["book_description"] = next_url
        print("*"*20)
        print("描述文件获取地址:", next_url)
        print("*"*20)
        pprint(item)
        # Hand the item to Scrapy so pipelines and feed exports receive it.
        yield item