# ajax - 爬取拉勾网 (scraping job postings from lagou.com through its AJAX endpoint)

#导入所需的包
import requests
from lxml import etree
import time


#定义一个函数获取,每一个详情页面的url,并将其传给parse_position_detail函数,进行解析
def request_list_page():
    """Crawl the Lagou position list through its AJAX endpoint.

    POSTs the search form (keyword ``python``) to ``positionAjax.json`` for
    pages 1 through 13, reads the position records out of each JSON response
    and hands the detail-page URL of every position to
    ``parse_position_detail``.

    Raises:
        KeyError: if a response does not contain the
            ``content`` / ``positionResult`` / ``result`` path, or a record
            has no ``positionId``.
    """
    # Request headers copied from a browser session.
    # NOTE(review): the Cookie looks like a captured logged-in session and
    # presumably expires - refresh it before running.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Referer": "https://www.lagou.com/jobs/list_python?",
        "Cookie": "WEBTJ-ID=20181204195941-1677916947628-09ddd577bd153a-5768397b-1049088-1677916947710d; _ga=GA1.2.511238332.1543924783; _gid=GA1.2.1010648621.1543924783; user_trace_token=20181204195854-f92b90e9-f7bb-11e8-8cbc-5254005c3644; LGSID=20181204195854-f92b9579-f7bb-11e8-8cbc-5254005c3644; LGUID=20181204195854-f92b97ad-f7bb-11e8-8cbc-5254005c3644; JSESSIONID=ABAAABAAAGGABCB38BDC8EA53CD82453DDE5F9B00C57D32; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=1c366f35294348c93a797a481e80ab90; sajssdk_2015_cross_new_user=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543926366,1543926403,1543926406,1543926424; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22%24device_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; _putrc=7B78D63385EF2027123F89F2B170EADC; login=true; unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B77645; hasDeliver=0; gate_login_token=efa314b7d39529de0db2f00b18a6702b55e6cbbdb407b917aab9581a504d027f; SEARCH_ID=92e4eca71190423d8e3cd0f548c4ef1b; TG-TRACK-CODE=search_code; _gat=1; LGRID=20181204211530-ac78254e-f7c6-11e8-8cbc-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543929379",
        "Host": "www.lagou.com",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest"
    }
    # Endpoint found by inspecting the site's AJAX traffic.
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    # Form fields sent with the POST request.
    data = {
        "first": "false",
        # Page number being requested; overwritten on every loop iteration.
        "pn": 1,
        # Search keyword.
        "kd": "python"
    }
    # Walk through result pages 1..13.
    for page in range(1, 14):
        data["pn"] = page
        response = requests.post(url, headers=headers, data=data)
        # Response.json() decodes a JSON body into Python objects.
        result = response.json()
        positions = result["content"]["positionResult"]["result"]
        for position in positions:
            position_id = position["positionId"]
            # The URL needs a "{}" placeholder; without it str.format()
            # silently ignores its argument and every position would point
            # at the same hard-coded detail page.
            position_url = "https://www.lagou.com/jobs/{}.html".format(position_id)
            parse_position_detail(position_url)


#解析:详情页面
def parse_position_detail(url):
    """Download one position detail page and extract its main fields.

    Args:
        url: Address of the position detail page to fetch.

    Returns:
        A dict with the keys ``position_name``, ``salary``, ``city``,
        ``work_year``, ``education`` and ``desc``, each holding the text
        scraped from the page.

    Raises:
        IndexError: if the page has no ``span.name`` element or fewer than
            four spans under ``dd.job_request``.
    """
    # NOTE(review): the Cookie looks like a captured logged-in session and
    # presumably expires - refresh it before running.
    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Cookie":"WEBTJ-ID=20181204195941-1677916947628-09ddd577bd153a-5768397b-1049088-1677916947710d; _ga=GA1.2.511238332.1543924783; _gid=GA1.2.1010648621.1543924783; user_trace_token=20181204195854-f92b90e9-f7bb-11e8-8cbc-5254005c3644; LGUID=20181204195854-f92b97ad-f7bb-11e8-8cbc-5254005c3644; JSESSIONID=ABAAABAAAGGABCB38BDC8EA53CD82453DDE5F9B00C57D32; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=1c366f35294348c93a797a481e80ab90; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22%24device_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; _putrc=7B78D63385EF2027123F89F2B170EADC; login=true; unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B77645; hasDeliver=0; TG-TRACK-CODE=search_code; _gat=1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; gate_login_token=b17b50a28b66502e1bae125431b6a5c839fcd68718733889b7ec96a1c2991711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543926403,1543926406,1543926424,1543930046; LGSID=20181204212638-3a7f10a4-f7c8-11e8-8cbc-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DOibpFP5C6LvdrMKKvOXdu961ZB0X6GuqsxHArcn4JzLsFqqtUaga76H80Rz6oFrG%26wd%3D%26eqid%3Da316524400685adb000000025c066bf5; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; SEARCH_ID=3647aeda7e1a49e3b9b031e28e40da85; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543930286; LGRID=20181204213037-c926f9d3-f7c8-11e8-8cbc-5254005c3644"
    }
    # Proxy taken from a public proxy list (kuaidaili).
    # NOTE(review): requests picks the proxy by URL scheme, so this
    # "http"-only mapping is not used for https:// URLs - confirm whether an
    # "https" entry is intended.
    proxies = {
        "http": "http://116.62.9.96:80",
    }
    response = requests.get(url, headers=headers, proxies=proxies)
    text = response.content.decode("utf-8")
    html = etree.HTML(text)

    position_name = html.xpath("//span[@class='name']/text()")[0]

    # The job_request block holds salary, city, experience and education as
    # consecutive spans, in that order.
    job_request_spans = html.xpath('//dd[@class="job_request"]//span')
    salary = job_request_spans[0].xpath('.//text()')[0].strip()
    city = job_request_spans[1].xpath('.//text()')[0].strip().replace("/", "").strip()
    # Strip whitespace, then the "/" separator, then whitespace again so the
    # separator is removed even when it is padded, and no stray spaces remain
    # (the original left whitespace on work_year).
    work_year = job_request_spans[2].xpath('.//text()')[0].strip().strip("/").strip()
    education = job_request_spans[3].xpath('.//text()')[0].strip().strip("/").strip()

    # Concatenate every text node of the description block into one string.
    desc = "".join(html.xpath('//dd[@class="job_bt"]//text()')).strip()

    # Hand the scraped fields back instead of discarding them.
    return {
        "position_name": position_name,
        "salary": salary,
        "city": city,
        "work_year": work_year,
        "education": education,
        "desc": desc,
    }



def main():
    """Script entry point: run the list-page crawl."""
    request_list_page()

# Start the crawl only when the file is executed as a script, not on import.
if __name__ =="__main__":
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值