Create the project
scrapy startproject book_info
Create the spider
cd book_info
scrapy genspider cl_book suning.com
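After these two commands the project has roughly the default Scrapy layout (middlewares.py and pipelines.py only contain the generated template classes at this point):

book_info/
    scrapy.cfg
    book_info/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            cl_book.py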
Write the spider file cl_book.py
# -*- coding: utf-8 -*-
import scrapy
import re
from ..items import BookInfoItem
from bs4 import BeautifulSoup
# Use deepcopy to keep fields from getting mixed up between requests
from copy import deepcopy


class ClBookSpider(scrapy.Spider):
    name = 'cl_book'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        # print(response.text)
        item = BookInfoItem()
        # 1. First grab the top-level categories
        # dt = response.xpath('/html/body/div[6]/div/div[1]/div[1]/div[1]/div[4]/dl/dt')
        m_items = response.xpath('//div[@class="menu-item"]')
        for m_item in m_items:
            # print(m_item)
            dd_list = m_item.xpath('.//dd/a')
            if dd_list:
                item['p_class'] = m_item.xpath('.//dt/h3/a/text()').extract_first()
                # print(item['p_class'])
                for dd in dd_list:
                    if dd is None:
                        continue
                    # print(dd.xpath('./text()').extract_first())
                    item['s_class'] = dd.xpath('./text()').extract_first()
                    # print(dd.xpath('./@href').extract_first())
                    item['booklist_url'] = dd.xpath('./@href').extract_first()
                    if 'search.suning.com' in item['booklist_url']:
                        print(item['booklist_url'])
                        item['booklist_url_ci'] = ' '
                    else:
                        item['booklist_url_ci'] = item['booklist_url'].split('-')[1]
                    # print(item)
                    # yield scrapy.Request(url=item['booklist_url'],
                    #                      meta={'item': item}, callback=self.parse_second)
                    yield scrapy.Request(url=item['booklist_url'],
                                         meta={'item': deepcopy(item)}, callback=self.parse_second)
    def parse_second(self, response):
        item = response.meta['item']
        # print(item)
        # print(response.url)
        # print('parse_second end')
        li_list = response.xpath('//ul[@class="clearfix"]/li')
        # print(li_list)
        for li in li_list:
            # print(li.xpath('.//p[@class="sell-point"]/a/@href').extract_first())
            item['book_url'] = 'https:' + li.xpath('.//p[@class="sell-point"]/a/@href').extract_first()
            # Problem: books that belong to the children's category ended up under the medicine category
            # Cause: Scrapy's asynchronous requests all shared the same item, so fields overwrote each other
            # Fix: pass a deep copy of the item with each request
            # print(item['book_url'])
            if item['book_url']:
                # yield scrapy.Request(url=item['book_url'],
                #                      meta={'item': item}, callback=self.parse_book_detail)
                yield scrapy.Request(url=item['book_url'],
                                     meta={'item': deepcopy(item)}, callback=self.parse_book_detail)
            # print(item)
        # Pagination
        # https://list.suning.com/emall/showProductList.do?ci=502348&pg=03&cp=2&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=010
        # https://list.suning.com/emall/showProductList.do?ci=502348&pg=03&cp=2
        # https://list.suning.com/emall/showProductList.do?ci=502348&pg=03&cp=3
        # Each page URL is built from ci and cp:
        #   ci -- the book category, already contained in item['booklist_url']
        #   cp -- the page number
        all_Page = response.xpath('//span[@class="page-more"]/text()').extract_first()
        # print(all_Page)
        if all_Page:
            # print(re.findall(r'\d+', all_Page))
            iallPage = int(re.findall(r'\d+', all_Page)[0])
            # print(iallPage)
            # print(item['booklist_url_ci'])
            if item['booklist_url_ci']:
                for i in range(iallPage + 1):
                    # print(i)
                    next_url = 'https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}'.format(
                        item['booklist_url_ci'], i)
                    # print(next_url)
                    yield scrapy.Request(url=next_url,
                                         meta={'item': response.meta['item']}, callback=self.parse_second)
            # yield item
            # yield item
            # Case 1: the item printed in BookInfoPipeline.process_item was missing
            #         booklist_url / book_url / book_name / book_author / book_press / book_time, e.g.:
            # {'book_url': 'https://product.suning.com/0071155354/12031082185.html',
            #  'booklist_url': 'https://list.suning.com/1-502322-0.html',
            #  'booklist_url_ci': '502322',
            #  'p_class': '人文社科',
            #  's_class': '历史'}
        else:
            print('all_Page is Null')

    def del_str(self, de_str):
        return de_str.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '').strip()
    def parse_book_detail(self, response):
        item = response.meta['item']
        # print(item, "parse_book_detail")
        book_div = response.xpath('//div[@class="proinfo-main"]/div[@class="proinfo-title"]')
        # print(book_div)
        h1 = book_div.xpath('./h1')
        # print(h1.extract_first())
        # Problem: I could not get the text of the h1 tag directly.
        # Looking closely at the printed markup, there is an extra span tag inside it --
        # is this an anti-scraping trick? How should it be handled?
        # I solved it with BeautifulSoup.
        # <h1 id="itemDisplayName">
        #     <span id="superNewProduct" class="snew hide">超级新品</span>
        #     小口袋中国经典故事(10册)愚公移山孔融让梨司马光砸缸守株待兔儿童读物0-3-6岁早教益智启蒙精美绘本中国经典性情养成故
        # </h1>
        # print(h1.xpath('./text()'))
        # Convert the selector to a string, then extract the text with BeautifulSoup
        if h1:
            soup = BeautifulSoup(h1.extract_first(), features='lxml')
            # print(soup.h1.text.replace('超级新品', ''))
            item['book_name'] = soup.h1.text.replace('超级新品', '')
            item['book_name'] = self.del_str(item['book_name'])
            # print(item['book_name'])
        book_info = response.xpath('//ul[@class="bk-publish clearfix"]')
        # print(book_info.extract_first())
        if book_info:
            soup = BeautifulSoup(book_info.extract_first(), features='lxml')
            book_li = soup.find_all('li')
            if len(book_li) > 0 and book_li[0]:
                book_author = book_li[0].text
                book_author = self.del_str(book_author)
                item['book_author'] = book_author
            else:
                item['book_author'] = ' '
            if len(book_li) > 1 and book_li[1]:
                book_press = book_li[1].text
                book_press = self.del_str(book_press)
                item['book_press'] = book_press
            else:
                item['book_press'] = ' '
            if len(book_li) > 2 and book_li[2]:
                book_time = book_li[2].text
                book_time = self.del_str(book_time)
                item['book_time'] = book_time
            else:
                item['book_time'] = ' '
        else:
            item['book_author'] = ' '
            item['book_press'] = ' '
            item['book_time'] = ' '
        yield item
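The deepcopy fix described in the comments above can be illustrated outside Scrapy: when several pending requests all carry a reference to the same mutable item, each loop iteration overwrites the fields of the ones queued earlier. A minimal sketch with plain dicts (no Scrapy involved, category names are just sample values):

from copy import deepcopy

item = {'p_class': '人文社科'}
pending = []
for s_class in ['历史', '哲学']:
    item['s_class'] = s_class
    pending.append(item)             # every entry points at the same dict
print([p['s_class'] for p in pending])   # ['哲学', '哲学'] -- the first value was overwritten

pending = []
for s_class in ['历史', '哲学']:
    item['s_class'] = s_class
    pending.append(deepcopy(item))   # each entry keeps its own snapshot
print([p['s_class'] for p in pending])   # ['历史', '哲学']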
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BookInfoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # top-level category
    p_class = scrapy.Field()
    # second-level category
    s_class = scrapy.Field()
    # URL of the book list page
    booklist_url = scrapy.Field()
    # URL of the book detail page
    book_url = scrapy.Field()
    # book title
    book_name = scrapy.Field()
    # author
    book_author = scrapy.Field()
    # publisher
    book_press = scrapy.Field()
    # publication date
    book_time = scrapy.Field()
    # the ci value needed for pagination
    booklist_url_ci = scrapy.Field()
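A scrapy.Item behaves like a dict, but only the fields declared above can be assigned; writing to an undeclared key raises a KeyError, which quickly catches typos in the spider. A small illustration (the price field is made up for the example):

item = BookInfoItem()
item['book_name'] = '测试'   # declared field, assignment works
item['price'] = 9.9          # KeyError: BookInfoItem does not support field: price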
start.py
from scrapy import cmdline
cmdline.execute(['scrapy', 'crawl', 'cl_book'])  # method 2
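Running python start.py from the project root (the directory containing scrapy.cfg) is equivalent to typing scrapy crawl cl_book on the command line, but makes it easy to start or debug the crawl from an IDE.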
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for book_info project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'book_info'
SPIDER_MODULES = ['book_info.spiders']
NEWSPIDER_MODULE = 'book_info.spiders'
LOG_LEVEL = 'WARNING'
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2816.400',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'book_info.middlewares.BookInfoSpiderMiddleware': 543,
}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'book_info.pipelines.BookInfoPipeline': 300,
}
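pipelines.py is not shown in this post. A minimal sketch of what BookInfoPipeline could look like, assuming each item is simply appended to a JSON Lines file (the file name book_info.jsonl is an assumption, not from the original project):

# pipelines.py -- a minimal sketch, not the original implementation
import json

class BookInfoPipeline:
    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.fp = open('book_info.jsonl', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # write one JSON object per line, keeping Chinese characters readable
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()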