1.主要是找到要爬取的数据在哪个资源文件下
爬取的网页链接:https://hr.163.com/job-list.html
实际上我们获取数据的资源文件url:https://hr.163.com/api/hr163/position/queryPage
2.看数据格式
3.分析完成,代码如下
# Scrape NetEase (163) job posting information from the public HR API
import pandas as pd
import requests
# Search keyword typed by the user; used both for the API query and the output filename.
key = input("请输入你要查找的职位名称:")
def fetch_job_data(key):
    """Fetch all NetEase job postings whose title matches *key*.

    Strategy: POST to the queryPage endpoint once with ``pageSize=1`` just
    to read the total number of matches, then POST again asking for that
    many records in a single page, so no pagination loop is needed.

    :param key: keyword to search positions for
    :return: list of job dicts (empty list when nothing matches)
    :raises requests.HTTPError: if the API answers with a non-2xx status
    :raises requests.Timeout: if the API does not respond in time
    """
    url = 'https://hr.163.com/api/hr163/position/queryPage'
    # Probe request: one record is enough to learn the total count.
    probe = {"currentPage": 1, "pageSize": 1, "keyword": key}
    res = requests.post(url, json=probe, timeout=10)  # timeout so the script cannot hang forever
    res.raise_for_status()  # fail loudly instead of crashing later in .json()
    total = res.json()['data']['total']
    if total == 0:
        # Short-circuit: a second request with pageSize=0 would be invalid.
        return []
    # Full request: pull every matching position in one page.
    res = requests.post(
        url,
        json={"currentPage": 1, "pageSize": total, "keyword": key},
        timeout=30,
    )
    res.raise_for_status()
    return res.json()['data']['list']
job_data = fetch_job_data(key)
work = []
for item in job_data:
    # The API may omit any of these fields (None). Normalize them up front
    # so the .replace()/.join() calls below cannot raise AttributeError/TypeError.
    # (The original code only guarded education/experience; description,
    # requirement and workPlaceNameList could still crash on None.)
    description = item.get("description") or ""
    requirement = item.get("requirement") or ""
    places = item.get("workPlaceNameList") or []
    work.append({
        "工作岗位名称": item.get("name"),
        "岗位类型": item.get("postTypeFullName"),
        # Newlines become <br> so multi-line text survives in a single CSV cell.
        "职位描述": description.replace('\n', '<br>'),
        "部门": item.get("firstDepName"),
        "技术要求": requirement.replace('\n', '<br>'),
        "工作地点": ", ".join(places),
        # Missing value means "no requirement" (不限).
        "学历要求": item.get("reqEducationName") or "不限",
        "工作经验": item.get("reqWorkYearsName") or "不限",
    })
save = pd.DataFrame(work)
# index=False: don't write the meaningless row index (matches the to_excel
# line below); utf-8-sig keeps the Chinese headers readable in Excel.
save.to_csv(f"网易职位信息-{key}.csv", index=False, encoding="utf-8-sig")
#save.to_excel(f"网易职位信息-{key}.xlsx", index=False)