话不多说,之前分析过了。
值得注意的地方有五个
1、list_%E6%96%87%E7%A7%98? 不能为中文
2、city=%E5%85%A8%E5%9B%BD& 不能为中文
3、‘kd’: ‘文秘’ 可以为中文
4、给他足够长的时间 time.sleep(10)
5、建议在晚上2~3点再运行,此时可以将time.sleep的时间数减少
import json
import requests
import csv
import time
def get_json(url, datas):
    """POST one page of a Lagou job search and return the postings as rows.

    Parameters:
        url:   the positionAjax.json endpoint URL.
        datas: form payload dict — 'first', 'pn' (page number), 'kd' (keyword).

    Returns:
        A list of rows, one per job, with fields in this order:
        [positionId, city, companyFullName, companyLabelList, district,
         education, firstType, formatCreateTime, positionName, salary,
         workYear].

    Raises:
        KeyError if the response JSON lacks the expected structure
        (e.g. when the anti-scraping check rejects the session).
    """
    my_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
        "Referer": "https://www.lagou.com/jobs/list_%E6%96%87%E7%A7%98?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
        "Content-Type": "application/x-www-form-urlencoded;charset = UTF-8"
    }
    # Throttle every request — Lagou blocks rapid-fire scraping.
    time.sleep(10)
    ses = requests.session()        # session keeps cookies across requests
    ses.headers.update(my_headers)
    # Visit the search page first so the session acquires the anti-scraping
    # cookies the AJAX endpoint requires; the Referer header must match.
    ses.get(
        "https://www.lagou.com/jobs/list_%E6%96%87%E7%A7%98?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=")
    result = ses.post(url=url, data=datas).json()
    jobs = result['content']['positionResult']['result']
    # One entry per CSV column, in output order — replaces eleven
    # duplicated append() calls with a single comprehension.
    fields = (
        'positionId',        # job posting ID
        'city',              # city
        'companyFullName',   # full company name
        'companyLabelList',  # benefits / perks
        'district',          # work location district
        'education',         # education requirement
        'firstType',         # job category
        'formatCreateTime',  # posting time
        'positionName',      # job title
        'salary',            # salary range
        'workYear',          # required years of experience
    )
    return [[job[f] for f in fields] for job in jobs]
def main():
    """Interactively scrape N pages of Lagou '文秘' (secretary) job
    listings and write them to lagou.csv.

    Prompts for the page count, then fetches each page via get_json().
    A failed page is retried once after an extra pause; a second failure
    propagates, but the 'with' block still closes the CSV file.
    """
    page = int(input('请输入你要抓取的页码总数:'))
    title = ['岗位id', '城市', '公司全名', '福利待遇', '工作地点', '学历要求', '工作类型', '发布时间', '职位名称', '薪资', '工作年限']
    # Loop-invariant: same endpoint for every page.
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    # 'with' guarantees the file is closed even when a page fails twice
    # (the original open()/no-close leaked the handle on that path).
    with open('lagou.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(title)
        for x in range(1, page + 1):
            datas = {
                'first': 'false',
                'pn': x,          # page number
                'kd': '文秘',      # search keyword (may be Chinese here)
            }
            try:
                infos = get_json(url, datas)
                for info in infos:
                    writer.writerow(info)
                print("第%s页正常采集" % x)
            except Exception:
                # Best-effort retry: pause again, then try the page once
                # more; if this also fails, the exception propagates.
                print("第%s页出现问题" % x)
                time.sleep(10)
                infos = get_json(url=url, datas=datas)
                for info in infos:
                    writer.writerow(info)
# Run the scraper only when this file is executed directly (not imported).
if __name__ == '__main__':
    main()