Crawling book information in depth: from top-level categories, to subcategories, to detail pages
(Worked on this from getting out of class at noon straight through to 6 p.m.; skipped playing 无线火力 entirely)
The main code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import re


class SuningSpider(scrapy.Spider):
    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/?safp=d488778a.homepage1.99345513004.47&safpn=10001']

    def parse(self, response):
        """Top-level categories"""
        div_list = response.xpath("//div[@class='menu-list']/div[@class='menu-item'][position()<9]")
        for i in div_list:
            item = {}
            item["大类别"] = i.xpath(".//h3/a/text()").extract_first()
            # The children's ("少儿") pages have a different layout and would need
            # separate handling (skipped for now)
            if item["大类别"] == "少儿":
                continue
            #sm_url = i.xpath(".//h3/a/@href").extract_first()
            #yield scrapy.Request(url=sm_url, callback=self.c_page, meta={"item": deepcopy(item)})
            # Extract the url of the subcategory page
            sm_url = i.xpath(".//h3/a/@href").extract_first()
            yield scrapy.Request(url=sm_url, callback=self.bg_url_details_page, meta={"item": deepcopy(item)})

    def bg_url_details_page(self, response):
        """All subcategories"""
        item = response.meta["item"]
        # Subcategory links
        a_list = response.xpath('//div[@id="search-path"]/dl//a')
        for i in a_list:
            item["小类别"] = i.xpath("./@title").extract_first()
            # Extract the list-page url for this subcategory
            sm_details_url = "https:" + i.xpath("./@href").extract_first()
            yield scrapy.Request(url=sm_details_url, callback=self.sm_details_page, meta={"item": deepcopy(item)})

    def sm_details_page(self, response):
        """Subcategory list pages"""
        item = response.meta["item"]
        li_list = response.xpath('//ul[@class="clearfix"]/li')
        # Build the detail-page url for every book on the page
        for a in li_list:
            details_url = "https:" + a.xpath('.//div[@class="img-block"]/a/@href').extract_first()
            yield scrapy.Request(url=details_url, callback=self.details_page, meta={"item": deepcopy(item)})
        # Build the next-page urls for the subcategory; the page number is the
        # third field. NOTE: this template is hard-coded to one subcategory id,
        # and list pages must come back to this method, not to parse
        for i in range(100):
            next_url = "https://list.suning.com/1-502325-{}-0-0-0-0-0-0-4.html".format(i)
            yield scrapy.Request(url=next_url, callback=self.sm_details_page, meta={"item": deepcopy(item)})

    def details_page(self, response):
        """Extract the data from a book's detail page"""
        item = response.meta["item"]
        item["标题"] = response.xpath('//div[@class="proinfo-title"]/h1/text()').extract()
        item["标题"] = [re.sub(r"\s", "", i) for i in item["标题"]]  # strip stray whitespace
        item["标题"] = [i for i in item["标题"] if len(i) > 0]  # drop empty strings
        item["作者"] = response.xpath('//div[@class="proinfo-main"]/ul/li[1]/text()').extract()
        item["作者"] = [re.sub(r"\s", "", i) for i in item["作者"]]  # strip stray whitespace
        item["作者"] = [i for i in item["作者"] if len(i) > 0]  # drop empty strings
        item["出版社"] = response.xpath('//div[@class="proinfo-main"]/ul/li[2]/text()').extract()
        item["出版社"] = [re.sub(r"\s", "", i) for i in item["出版社"]]  # strip stray whitespace
        item["出版社"] = [i for i in item["出版社"] if len(i) > 0]  # drop empty strings
        item["大图"] = "https:" + response.xpath('//div[@class="imgzoom-main"]/a/img/@src').extract_first()
        item["服务"] = response.xpath('//dd[@id="proinfo-id"]/span[position()>2]/a//text()').extract()
        item["服务"] = [re.sub(r"\s", "", i) for i in item["服务"]]  # strip stray whitespace
        item["服务"] = [i for i in item["服务"] if len(i) > 0]  # drop empty strings
        #print(item)
        yield item

    def c_page(self, response):
        """Handle the children's ("少儿") category (left unimplemented)"""
        pass
        #item = response.meta["item"]
        #div_list = response.xpath('//div[@class="banner-nav"]/div[position()<4]')
        #for i in div_list:
        #    item["小类别"] = i.xpath('./div/h4/a/text()').extract()
        #    print(item)
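To try the spider out, run scrapy crawl suning from the Scrapy project directory (suning is the name defined on the class).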
And here is the part that saves to the database (pipelines.py):
# -*- coding: utf-8 -*-
from pymongo import MongoClient


class SnspiderPipeline:
    def open_spider(self, spider):
        # Open one connection when the spider starts, not once per item
        self.client = MongoClient(host="127.0.0.1", port=27017)
        self.db = self.client["suning"]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db.book_data.insert_one(dict(item))  # insert the record into MongoDB
        print(item)
        return item
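One thing to remember: the pipeline only runs if it is enabled in settings.py. A minimal sketch; the module path SnSpider.pipelines is an assumption based on the class name, so substitute your actual project package:

# settings.py
# Enable the MongoDB pipeline; "SnSpider.pipelines" is assumed from the class
# name above, so replace it with the real module path of your project
ITEM_PIPELINES = {
    "SnSpider.pipelines.SnspiderPipeline": 300,
}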
(Screenshots followed here: the code in PyCharm, the run output, and the records in the database.)
Finally
Once the approach is clear it really isn't hard: you just drill down layer by layer to fetch the information, from the overall structure down to the details.
- First, grab the top-level book categories
- Under each top-level category there is a pile of subcategories
- Finally, inside each subcategory, request the detail pages and extract the information
When passing item between the different extraction methods you have to use a deep copy, because Scrapy is asynchronous and highly concurrent rather than executing in single-threaded order. Without deepcopy, every pending request shares the same dict, so data extracted later overwrites data extracted earlier and you end up with lots of duplicated records. Pay close attention to this whenever you extract data across several levels.
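A minimal standalone illustration of the pitfall (plain Python, no Scrapy; the category values are invented for the demo):

from copy import deepcopy

item = {"大类别": "文学"}

# Without deepcopy: every "request" shares the same dict, so the value
# written for the second subcategory clobbers the first one
shared = []
for sub in ["小说", "散文"]:
    item["小类别"] = sub
    shared.append(item)  # the same object is appended twice
print([m["小类别"] for m in shared])  # ['散文', '散文']: the first value is lost

# With deepcopy: each branch carries its own independent snapshot
copied = []
for sub in ["小说", "散文"]:
    item["小类别"] = sub
    copied.append(deepcopy(item))
print([m["小类别"] for m in copied])  # ['小说', '散文']: both survive

In Scrapy the same thing happens between yielding a request and its callback running: other callbacks may mutate the shared dict in the meantime, which is exactly why the spider above deep-copies item into every meta.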
(As the saying goes: whatever problem you're stuck on right now, someone else has almost certainly solved it already, so when you hit one, search the web and see how other people did it.)
Next comes a problem that made me want to tear my hair out.
The book prices are generated dynamically. I did manage to find the requests, but the URL is such a revolting mess that I couldn't be bothered to analyze it; it would have taken quite a while. Here are three of the addresses I captured:
https://ds.suning.com/ds/generalForTile/000000011565503563__2_0070755385,000000011811415867__2_0070418556,000000010544581440_,000000010672180054_,000000010728394111__2_0070088999,000000011557399193__2_0070129646,000000011124955920__2_0070852352,000000000101977802__2_0070166575,000000000659940249__2_0070167435,000000000122856180__2_0070121234-781-2-0070091633-1--ds0000000006215.jsonp?
https://ds.suning.com/ds/generalForTile/000000011712876602__2_0070067633,000000000646456032_,000000010229712282_,000000000655759316_,000000010580157816__2_0070213869,000000010620044359__2_0070091573,000000010586275791__2_0070213869,000000010572418369__2_0070221893,000000011565456784__2_0070755385,000000011174056743__2_0070847771-781-2-0070167435-1--ds0000000008788.jsonp?
https://ds.suning.com/ds/generalForTile/000000010572014716_,000000010806645938_,000000000646485946__2_0070167435,000000011171110489__2_0070852352,000000000825022214__2_0070083569,000000010574049519_,000000010291266705__2_0070167435,000000010565119840__2_0070067633,000000010766097948_,000000010572033876_-781-2-0070221893-1--ds0000000004886.jsonp?
Utterly revolting.
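Still, if you ever did want those prices, the usual trick with a jsonp endpoint like these is to fetch it and strip the callback wrapper to get plain JSON. A rough sketch with requests, using the first URL above; the wrapper-stripping regex is a generic assumption, and so is everything about what the response contains:

# -*- coding: utf-8 -*-
import json
import re

import requests

# The first of the jsonp price endpoints captured above
price_url = (
    "https://ds.suning.com/ds/generalForTile/"
    "000000011565503563__2_0070755385,000000011811415867__2_0070418556,"
    "000000010544581440_,000000010672180054_,000000010728394111__2_0070088999,"
    "000000011557399193__2_0070129646,000000011124955920__2_0070852352,"
    "000000000101977802__2_0070166575,000000000659940249__2_0070167435,"
    "000000000122856180__2_0070121234-781-2-0070091633-1--ds0000000006215.jsonp?"
)

# Real-world calls may also need browser-like headers (User-Agent, Referer)
resp = requests.get(price_url)

# jsonp responses look like callbackName({...}); pull out the JSON payload
match = re.search(r"\((.*)\)\s*;?\s*$", resp.text, re.S)
if match:
    data = json.loads(match.group(1))
    # The field layout inside `data` is whatever Suning returns; it is not
    # analyzed here, so inspect it interactively to locate the price fields
    print(json.dumps(data, ensure_ascii=False, indent=2)[:500])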