1.请求传参
使用场景:如果爬取解析的数据不在同一张页面中(深度爬取)
需求:爬取网站图片名称,图片描述【岗位名称和岗位描述不在同一张页面中】
spider.py
import requests
import scrapy
from imgPro.items import ImgproItem
class ImgSpider(scrapy.Spider):
    """Two-level (depth) crawl of sc.chinaz.com image listings.

    The listing page supplies the image name (`na`); each detail page
    supplies a second name field (`name`). The partially-filled item is
    passed from `parse` to `parse_detail` through `Request.meta`.
    """
    name = 'img'
    # Must stay commented out: an over-strict allowed_domains silently
    # filters the detail-page requests without raising any error.
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/']  # first listing page
    # URL template for listing pages 2..9
    new_url = 'https://sc.chinaz.com/tupian/index_%d.html'
    page_num = 2

    def parse_detail(self, response):
        """Detail-page callback: recover the item from meta, fill `name`, emit it."""
        item = response.meta['item']
        # Second text node under the "smr" element holds the description.
        name = response.xpath('//*[@class="smr"]/text()')[1].extract()
        item['name'] = name
        yield item

    def parse(self, response):
        """Listing-page callback: extract each entry, follow its detail URL, paginate."""
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # BUG FIX: the original created ONE ImgproItem before the loop and
            # shared it across every request; concurrent parse_detail callbacks
            # then overwrote each other's 'na'. Each entry needs its own item.
            item = ImgproItem()
            src = 'http:' + div.xpath('./div/a/@href').extract_first()
            na = div.xpath('./div/a/@alt').extract_first()
            item['na'] = na
            # Depth crawl: hand the item to the detail-page callback via meta.
            yield scrapy.Request(url=src, callback=self.parse_detail, meta={'item': item})
        # Pagination: schedule the next listing page (pages 2..9) back into parse().
        if self.page_num < 10:
            next_url = self.new_url % self.page_num  # redundant format() removed
            self.page_num += 1
            yield scrapy.Request(next_url, callback=self.parse)
items.py
class ImgproItem(scrapy.Item):
    """Item carrying the two scraped fields for one image."""
    na = scrapy.Field()    # name taken from the listing page
    name = scrapy.Field()  # name/description taken from the detail page
pipelines.py
class ImgproPipeline:
    """Minimal pipeline: dump every scraped item to stdout and pass it on."""

    def process_item(self, item, spider):
        # Show the item for debugging purposes.
        print(item)
        # Return it unchanged so any later pipeline stage receives it.
        return item
settings.py
# Enable the item pipeline (uncommented from the generated settings template).
ITEM_PIPELINES = {
    'imgPro.pipelines.ImgproPipeline': 300,
}
问题:spider.py 中的 allowed_domains = ['www.xxx.com'] 没有注释掉,导致深度爬取的详情页请求被域名过滤、一直爬取不到数据,而程序始终没有报错!!!
仔细啊!!!