需要安装的库:requests 和 pandas
2019年8月12日测试可用,下面是源码,复制粘贴即可用
import requests,pandas
def parse_page(keyword, pn, timeout=10):
    """Fetch one page of Lagou job-search results as decoded JSON.

    A short-lived session first requests the search page (the Referer URL)
    so that the session keeps whatever cookies the site sets, then posts the
    search form to the JSON endpoint using that same session.

    Args:
        keyword: Search term, sent as the ``kd`` form field.
        pn: Page number (an integer), sent as the ``pn`` form field.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status on the search request.
    """
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    headers = {
        # Default Referer of the search page.
        "Referer": "https://www.lagou.com/jobs/list_?labelWords=&fromSearch=true&suginput=",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    }
    data = {
        "first": "true",
        "pn": "%d" % pn,
        "kd": "%s" % keyword,
    }
    # The context manager closes the session's pooled connections on exit,
    # including when one of the requests raises.
    with requests.Session() as session:
        # Issue one search-page request so the session stores its cookies.
        session.get(headers["Referer"], headers=headers, timeout=timeout)
        response = session.post(url, headers=headers, data=data, timeout=timeout)
        # Surface HTTP errors here instead of failing later while decoding
        # or indexing an error page.
        response.raise_for_status()
        return response.json()
def parse_infos(infos):
    """Flatten one search-response dict into a list of table rows.

    Args:
        infos: Decoded response; the job entries are read from
            ``infos['content']['positionResult']['result']``.

    Returns:
        A list with one 13-item list per job: the twelve fields named in
        ``fields`` below, in that order, followed by the job's detail-page
        URL built from its ``positionId``.

    Raises:
        KeyError: If the response or any job entry lacks an expected key.
    """
    # Keys copied verbatim from each job entry, in output column order.
    fields = (
        "companyFullName",
        "companyShortName",
        "companySize",
        "financeStage",
        "district",
        "positionName",
        "workYear",
        "education",
        "salary",
        "jobNature",
        "positionAdvantage",
        "industryField",
    )
    # Locate the list of all job entries in the response.
    all_jobs = infos["content"]["positionResult"]["result"]
    jobs = []
    for job in all_jobs:
        row = [job[name] for name in fields]
        row.append("http://www.lagou.com/jobs/%s.html" % job["positionId"])
        jobs.append(row)
    return jobs
def main():
    """Prompt for a job keyword, download every result page, and save a CSV.

    Reads the keyword from standard input, fetches page 1 to learn the total
    number of results, walks all pages, and writes the collected rows to
    ``lagoudata.csv`` (UTF-8 with BOM) in the current directory.
    """
    # Results per page assumed by the page-count calculation below.
    page_size = 15

    keyword = input('请输入查询岗位:')
    first_page = parse_page(keyword, 1)
    # Total number of search results.
    count = int(first_page['content']['positionResult']['totalCount'])
    # Ceiling division; yields 0 pages when there are no results.
    page_sum = (count - 1) // page_size + 1
    print("共查询到 %d 条与\"%s\"相关的岗位信息,一共 %d 页" % (count, keyword, page_sum))

    all_infos = []
    for i in range(1, page_sum + 1):
        # Page 1 is already in hand from the count lookup; requesting it
        # again would only cost an extra round trip.
        infos = first_page if i == 1 else parse_page(keyword, i)
        print("开始解析第 %d 页" % i)
        all_infos += parse_infos(infos)

    print("数据获取完成,开始写入数据")
    data = pandas.DataFrame(
        data=all_infos,
        columns=['公司全名', '公司简称', '公司规模', '融资阶段', '区域', '职位名称', '工作经验', '学历要求', '月薪', '工作性质', '职位福利', '行业', '链接'],
    )
    data.to_csv("lagoudata.csv", encoding='utf-8-sig')
    print("已导入到 lagoudata.csv !")
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
运行成功截图:
再看一眼获取到的数据:【防止违规发帖,已经打码】