需要导入的包
import requests
首先,爬虫需要请求的 url 以及请求头等验证信息(这里以腾讯招聘为例)
# Target endpoint: Tencent careers public job-query API.
url = 'https://careers.tencent.com/tencentcareer/api/post/Query?'

# Browser-like User-Agent so the server treats us as a normal client.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

# Query-string parameters that requests will append to the url.
# Values were taken from the browser's network panel and can be substituted.
params = dict(
    pageIndex="2",
    pageSize="10",
    language="zh-cn",
    area="cn",
)
发送请求获取响应数据,并将其解析为 JSON(接口返回的是 JSON,不是网页源代码)
# Issue the GET request, then decode the JSON body into a Python dict.
# (.json() parses the response payload — the API returns JSON.)
response = requests.get(url=url, headers=headers, params=params)
get = response.json()
解析返回的数据:取出单个职位信息
# Pull the first posting out of the decoded payload and show it whole,
# then print just its location field.
first_post = get['Data']['Posts'][0]
print(first_post)
location = first_post['LocationName']
print(location)
或者运用 for 循环遍历本页的全部职位
# Walk every posting on this page and print its key fields on one line.
for job in get['Data']['Posts']:
    print(job['LocationName'], job['LastUpdateTime'], job['RecruitPostName'])
爬取多页的数据只需要更改数据的页码
# Crawl multiple pages by varying only the page number.
#
# BUG FIX: the original loop set params['pageIndex'] = page but then
# immediately reassigned params to a fresh dict (without pageIndex), and
# never issued a request inside the loop — so the page number was discarded
# and no extra pages were fetched. The request must live INSIDE the loop so
# each page index actually reaches the server.
for page in range(1, 101):
    page_params = {
        "pageIndex": str(page),  # only this value changes per iteration
        "pageSize": "10",
        "language": "zh-cn",
        "area": "cn",
    }
    data = requests.get(url=url, headers=headers, params=page_params).json()
    # Print the same fields as the single-page example, for every posting.
    for job in data['Data']['Posts']:
        print(job['LocationName'], job['LastUpdateTime'], job['RecruitPostName'])