import scrapy
from wangyi.items import WangyiItem
class JobSpider(scrapy.Spider):
    """Outline of a spider for the NetEase (163.com) recruitment job listing.

    NOTE(review): ``start_urls`` points at a JSON API endpoint, and the
    original author's note says the site is a dynamic page rendered from JSON
    whose HTML source contains no data, so XPath cannot be applied to it
    directly. The XPath-based flow below is therefore only a sketch: the empty
    XPath expressions are placeholders that must be filled in (or replaced by
    JSON parsing) before this spider can run.
    """

    name = "job"
    allowed_domains = ["163.com"]
    # start_urls was edited by hand to point at the position query endpoint.
    start_urls = ["https://hr.163.com/api/hr163/position/queryPage"]

    def parse(self, response):
        """Yield one item per target job node, then request the next page.

        Args:
            response: The response Scrapy downloaded for the current page.

        Yields:
            ``WangyiItem`` objects for the selected job nodes, followed by a
            ``scrapy.Request`` for the next page when a next-page link exists.
        """
        # Select every job node in the listing.
        node_list = response.xpath(
            '//*[@id="p-job-list"]/div[2]/div[2]/div/div/div[2]/div/div/div[2]'
        )

        for num, node in enumerate(node_list):
            # Filter condition: only the even-indexed nodes are target nodes.
            if num % 2 == 0:
                item = WangyiItem()
                # TODO: placeholder -- supply the XPath of the job name.
                item['name'] = node.xpath('').extract_first()
                # TODO: populate the remaining item fields here.
                yield item

        # Simulate pagination.
        # TODO: placeholder -- supply the XPath of the "next page" link.
        part_url = response.xpath("").extract_first()

        # Stop when there is no next-page link at all (extract_first() returned
        # None; joining None would just re-request the current page) or when
        # the link is the placeholder sentinel that marks the last page.
        if part_url is not None and part_url != ' javaxxx xxx':
            next_url = response.urljoin(part_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
# —— The above is only a rough outline of the approach.