# 最新拉钩网站数据爬取 (scrape the latest job-posting data from lagou.com)
import requests
import pandas as pd
def get_page(num, name):
    """Fetch one page of job postings from Lagou's AJAX search endpoint.

    Lagou's anti-crawler check rejects a direct POST, so we first GET the
    search page inside a Session to pick up the required cookies, then make
    the POST through the *same* session so those cookies are reused.

    Args:
        num: 1-based page number to fetch (sent as the 'pn' form field).
        name: position keyword to search for (sent as the 'kd' form field).

    Returns:
        The decoded JSON response as a dict.

    Raises:
        requests.HTTPError: if either request returns a non-2xx status.
        requests.Timeout: if a request exceeds the 3-second timeout.
    """
    print('正在爬取第{}页'.format(num))
    # NOTE: 'needAddtionalResult' is misspelled in Lagou's own API — keep as-is.
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    referer_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    }
    data = {
        'first': 'false',
        'pn': num,
        'kd': name,
        'sid': '7fd39764bafa4e92a0e99571f703693d',
    }
    # Use the session as a context manager so its connections are released,
    # and POST through it so the anti-crawler cookies from the warm-up GET
    # are sent automatically (the original leaked the session and copied
    # cookies to a separate requests.post call by hand).
    with requests.Session() as session:
        session.get(url=referer_url, headers=headers, timeout=3)
        res = session.post(url=ajax_url, headers=headers, data=data, timeout=3)
    res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    res.encoding = 'utf-8'
    return res.json()
def get_page_info(page_data):
    """Extract the fields of interest from one page of Lagou JSON.

    Args:
        page_data: parsed JSON dict as returned by get_page().

    Returns:
        A list of rows, one per position; each row is a list of 13 values in
        the column order expected by write_csv().

    Raises:
        KeyError: if the response does not contain
            content.positionResult.result or a position lacks an expected key.
    """
    positions = page_data['content']['positionResult']['result']
    # Keys listed in the exact order of write_csv()'s columns.
    # NOTE: 'positionLables' is misspelled in Lagou's own API — keep as-is.
    field_keys = (
        'positionName', 'companyFullName', 'companyShortName', 'companySize',
        'companyLogo', 'firstType', 'secondType', 'thirdType',
        'positionLables', 'city', 'salary', 'workYear', 'education',
    )
    # Data-driven comprehension replaces 13 temporaries + 13 appends into a
    # variable that shadowed the builtin `list`.
    return [[position[key] for key in field_keys] for position in positions]
def url_page(name, max_pages=30):
    """Crawl pages 1..max_pages for ``name`` and collect every parsed row.

    The page count was hard-coded at 30; it is now a parameter with the same
    default, so existing callers are unaffected.

    Args:
        name: position keyword to search for.
        max_pages: number of result pages to fetch (default 30).

    Returns:
        Flat list of rows from get_page_info() across all fetched pages.
    """
    results = []
    # A for-over-range replaces the original manual counter while/else loop.
    for num in range(1, max_pages + 1):
        page_data = get_page(num, name)
        results.extend(get_page_info(page_data))
    print('已经爬取所有的页数')
    return results
def write_csv(results, name):
    """Write crawled rows to ``<name>.csv`` in the current directory.

    Args:
        results: list of 13-element rows as produced by get_page_info().
        name: search keyword; used as the CSV file's base name.
    """
    columns = ['职位名称', '公司名称', '公司小名', '公司人数', '公司照片',
               '公司种类1', '公司种类2', '专业', '职位', '所在城市',
               '年薪', '工作经验', '学位']
    df = pd.DataFrame(data=results, columns=columns)
    # utf-8-sig adds a BOM so Excel recognises the Chinese headers correctly
    # (plain utf-8 renders them as mojibake when double-clicked on Windows).
    df.to_csv(name + '.csv', index=False, encoding='utf-8-sig')
def main(name):
    """Crawl every result page for ``name`` and dump the rows to '<name>.csv'."""
    write_csv(url_page(name), name)
if __name__ == '__main__':
    # Script entry point: ask the user which position keyword to crawl.
    keyword = input('请输入你要查询的职位:')
    main(keyword)