使用场景
要爬取解析的数据不在同一个页面中,需要进行深度爬取
需求
爬取招聘网站某类的岗位名称、岗位描述
import scrapy
from spider.bossPro.bossPro.items import BossproItem
class BossSpider(scrapy.Spider):
    """Spider that collects a job name and job description per listing.

    The job name is read from the listing page in ``parse``; the job
    description lives on a separate detail page, so each partially filled
    item is handed to ``parse_detail`` through the request's ``meta`` dict.
    """

    name = 'boss'
    start_urls = ['https://www.zhipin.com/c101020100/?query=Java&industry=&position=&ka=hot-position-1']

    def parse_detail(self, response):
        """Fill in ``job_desc`` on the item passed via ``meta`` and yield it.

        Args:
            response: Detail-page response; ``response.meta['item']`` must
                hold the item created in ``parse``.

        Yields:
            The item with ``job_desc`` set, handed on to the item pipeline.
        """
        # Receive the item that parse() attached to the request.
        item = response.meta['item']
        desc_parts = response.xpath(
            '//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()'
        ).extract()
        item['job_desc'] = ''.join(desc_parts)
        # Submit the completed item to the pipeline.
        yield item

    def parse(self, response):
        """Extract job names from the listing page and request each detail page.

        Args:
            response: Listing-page response for one of ``start_urls``.

        Yields:
            One ``scrapy.Request`` per listing entry that has a detail link,
            carrying the partially filled item in ``meta``.
        """
        li_list = response.xpath('//*[@id="main"]/div/div[2]/ul/li')
        for li in li_list:
            job_name = li.xpath('.//div[@class="job-title"]/span[1]/a/text()').extract_first()
            href = li.xpath('.//div[@class="job-title"]/span[1]/a/@href').extract_first()
            if href is None:
                # extract_first() returns None when nothing matches; without a
                # link there is no detail page to request, so skip the entry
                # instead of failing on string concatenation.
                self.logger.warning('No detail link found for job %r; skipping', job_name)
                continue

            item = BossproItem()
            item['job_name'] = job_name

            # Resolve the (normally site-relative) href against the page URL.
            detail_url = response.urljoin(href)

            # Manually issue the detail-page request; the meta dict is passed
            # through to the callback so it can finish populating the item.
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})