import requests
import csv
import time
import random
# Initial URL: the job-list landing page. Visiting it first seeds the
# session with the cookies that Lagou's Ajax endpoint requires.
start_url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='
# The real URL: the Ajax endpoint that returns job postings as JSON.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
# Request headers. The commented-out entries were captured from a real
# browser session but are not required for the request to succeed.
header = {
# "Accept": "application/json, text/javascript, */*; q=0.01",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
# "Content-Length": "63",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
# "Cookie": "user_trace_token=20200407164400-b87322db-c1e7-4de7-ae6d-ca7222ae2b1d; LGUID=20200407164400-090ff63b-828f-47f0-bdd0-25a48b67e65a; _ga=GA1.2.1088314258.1586249042; _gid=GA1.2.105962977.1586249042; index_location_city=%E5%85%A8%E5%9B%BD; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2217153cfc9f3245-0c53312e76d2f1-f313f6d-2073600-17153cfc9f481c%22%2C%22%24device_id%22%3A%2217153cfc9f3245-0c53312e76d2f1-f313f6d-2073600-17153cfc9f481c%22%7D; JSESSIONID=ABAAABAAAEEAAIIF8998003D3B5E1A8F4C933A1D85306B5; WEBTJ-ID=20200408161130-17158d81dae9a-016ce58649722-f313f6d-2073600-17158d81daf7b9; PRE_UTM=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1586249042,1586261172,1586333491; LGSID=20200408161131-72d994e8-b7c6-4ee0-b89a-5c5177709b85; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DV8WamlooCDyoQBz2hVumMymXPEZuQGBPgpj1z%5F4mTDG%26ck%3D6305.3.94.403.153.184.303.287%26shh%3Dwww.baidu.com%26sht%3Dbaiduhome%5Fpg%26wd%3D%26eqid%3D8ee8ab85000508e4000000035e8d872f; TG-TRACK-CODE=index_search; SEARCH_ID=879751a166c8491dbc2f3cd42b9c93dc; X_HTTP_TOKEN=b33e713e9b530d407943336851b9be0b0df20f38b2; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1586333497; LGRID=20200408161138-89ca7bed-9983-49d5-90dd-98cd4d08ea1a",
"Host": "www.lagou.com",
# "Origin": "https://www.lagou.com",
"Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
# "Sec-Fetch-Dest": "empty",
# "Sec-Fetch-Mode": "cors",
# "Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
# "X-Anit-Forge-Code": "0",
# "X-Anit-Forge-Token": None,
# "X-Requested-With": "XMLHttpRequest",
}
# Ask the user which job title to search for.
kd = input('请输入您要查询的岗位')
# Page through the first 30 result pages.
for i in range(1, 31):
    print(f'正在爬取第{i}页')
    # POST parameters expected by the Ajax endpoint.
    form_data = {
        "first": "false",
        "pn": i,   # page number
        "kd": kd,  # job keyword
        "sid": "669db70186ae4a17ab17dc30481afd3c"
    }
    # Lagou invalidates the anti-bot cookies after a handful of requests,
    # so start a fresh session every 10 pages. Since i starts at 1, a
    # session always exists before it is first used below.
    if i % 10 == 1:
        session = requests.Session()
        # Visit the landing page so the session picks up the cookies the
        # Ajax endpoint requires.
        session.get(start_url, headers=header)
    # The Session already sends its own cookie jar with every request, so
    # there is no need to pass cookies=session.cookies explicitly.
    response = session.post(url, headers=header, data=form_data)
    # Guard the JSON decode: calling response.json() on a non-200 reply
    # (e.g. an anti-bot block page) would raise and kill the whole run.
    if response.status_code == 200:
        print(response.json())
        # Drill into the payload for the list of job postings on this page.
        info = response.json()['content']['positionResult']['result']
        # Append this page's rows to the CSV file.
        with open('数据分析.csv', 'a', encoding='utf-8', newline='') as file:
            # One writer per file handle — not one per row.
            writer = csv.writer(file)
            for data in info:
                # Collect the fields of one posting in column order.
                list_data = [
                    data['positionName'],
                    data['companyShortName'],
                    data['companySize'],
                    data['industryField'],
                    data['financeStage'],
                    ','.join(data['companyLabelList']),
                    data['salary'],
                    '工作年限:' + data['workYear'],
                    data['education'],
                    data['city'],
                ]
                # Echo the posting to the console.
                print('岗位名称:' + data['positionName'] + '\t' +
                      "公司名称:" + data['companyShortName'] + '\t' +
                      "公司规模:" + data['companySize'] + '\t' +
                      "所属行业:" + data['industryField'] + '\t' +
                      "融资阶段:" + data['financeStage'] + '\t' +
                      '公司福利:' + ','.join(data['companyLabelList']) + '\t' +
                      '薪资阶段:' + data['salary'] + '\t' +
                      '工作年限:' + data['workYear'] + '\t' +
                      '学历要求:' + data['education'] + '\t' +
                      '工作地点:' + data['city']
                      )
                # Persist the row to the CSV file.
                writer.writerow(list_data)
    # Random pause between pages to reduce the chance of an anti-scraping
    # ban (this is also what the time/random imports are for).
    time.sleep(random.random() * 2)
# python实战-爬取拉钩指定岗位数据 (Python in practice: scraping Lagou data for a chosen position)
# 最新推荐文章于 2024-07-24 03:41:39 发布 — blog-platform footer accidentally pasted into the script; kept as comments so the file stays runnable.