#导入所需的包
import requests
from lxml import etree
import time
def request_list_page():
    """Crawl the Lagou job-list AJAX endpoint and parse every posting found.

    Posts the search form for the keyword "python" to the position-list
    endpoint for pages 1 through 13. For each posting in the JSON reply it
    reads ``positionId``, builds the matching detail-page URL and passes
    that URL to ``parse_position_detail``.

    Raises:
        KeyError: if a reply does not contain the
            ``content`` -> ``positionResult`` -> ``result`` keys.
    """
    # Request headers sent with every list-page request.
    # NOTE(review): the Cookie looks like a captured logged-in browser
    # session (it carries login=true and a gate_login_token); it will
    # presumably need refreshing and should not live in source control.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Referer": "https://www.lagou.com/jobs/list_python?",
        "Cookie": "WEBTJ-ID=20181204195941-1677916947628-09ddd577bd153a-5768397b-1049088-1677916947710d; _ga=GA1.2.511238332.1543924783; _gid=GA1.2.1010648621.1543924783; user_trace_token=20181204195854-f92b90e9-f7bb-11e8-8cbc-5254005c3644; LGSID=20181204195854-f92b9579-f7bb-11e8-8cbc-5254005c3644; LGUID=20181204195854-f92b97ad-f7bb-11e8-8cbc-5254005c3644; JSESSIONID=ABAAABAAAGGABCB38BDC8EA53CD82453DDE5F9B00C57D32; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=1c366f35294348c93a797a481e80ab90; sajssdk_2015_cross_new_user=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543926366,1543926403,1543926406,1543926424; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22%24device_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; _putrc=7B78D63385EF2027123F89F2B170EADC; login=true; unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B77645; hasDeliver=0; gate_login_token=efa314b7d39529de0db2f00b18a6702b55e6cbbdb407b917aab9581a504d027f; SEARCH_ID=92e4eca71190423d8e3cd0f548c4ef1b; TG-TRACK-CODE=search_code; _gat=1; LGRID=20181204211530-ac78254e-f7c6-11e8-8cbc-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543929379",
        "Host": "www.lagou.com",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest"
    }
    # Endpoint to request; it was found by analysing the page's AJAX traffic.
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    # Form fields sent as the POST body.
    data = {
        "first": "false",
        # Page number being requested (overwritten on every iteration).
        "pn": 1,
        # Search keyword.
        "kd": "python"
    }
    # Visit result pages 1 through 13.
    for x in range(1, 14):
        data["pn"] = x
        # A timeout keeps one unresponsive request from hanging the crawl.
        response = requests.post(url, headers=headers, data=data, timeout=10)
        # .json() decodes a JSON response body into Python objects.
        result = response.json()
        positions = result["content"]["positionResult"]["result"]
        for position in positions:
            positionId = position["positionId"]
            # The id must be substituted into the URL; the previous literal
            # had no "{}" placeholder, so every posting pointed at the same
            # hard-coded detail page.
            position_url = "https://www.lagou.com/jobs/{}.html".format(positionId)
            parse_position_detail(position_url)
def _first_text(node):
    """Return the first text fragment found under *node*, or "" if none."""
    fragments = node.xpath('.//text()')
    return fragments[0] if fragments else ""


def parse_position_detail(url):
    """Download one job detail page and extract its main fields.

    Args:
        url: Address of the detail page to fetch.

    Returns:
        A dict with the keys ``position_name``, ``salary``, ``city``,
        ``work_year``, ``education`` and ``desc``, or ``None`` when the
        page does not contain the expected title / job-request elements.
    """
    # NOTE(review): the Cookie looks like a captured logged-in browser
    # session (login=true, gate_login_token); it will presumably need
    # refreshing and should not live in source control.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Cookie": "WEBTJ-ID=20181204195941-1677916947628-09ddd577bd153a-5768397b-1049088-1677916947710d; _ga=GA1.2.511238332.1543924783; _gid=GA1.2.1010648621.1543924783; user_trace_token=20181204195854-f92b90e9-f7bb-11e8-8cbc-5254005c3644; LGUID=20181204195854-f92b97ad-f7bb-11e8-8cbc-5254005c3644; JSESSIONID=ABAAABAAAGGABCB38BDC8EA53CD82453DDE5F9B00C57D32; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=1c366f35294348c93a797a481e80ab90; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22%24device_id%22%3A%22167792e9c807a-08793496a1ae7b-5768397b-1049088-167792e9c81402%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; _putrc=7B78D63385EF2027123F89F2B170EADC; login=true; unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B77645; hasDeliver=0; TG-TRACK-CODE=search_code; _gat=1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; gate_login_token=b17b50a28b66502e1bae125431b6a5c839fcd68718733889b7ec96a1c2991711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543926403,1543926406,1543926424,1543930046; LGSID=20181204212638-3a7f10a4-f7c8-11e8-8cbc-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DOibpFP5C6LvdrMKKvOXdu961ZB0X6GuqsxHArcn4JzLsFqqtUaga76H80Rz6oFrG%26wd%3D%26eqid%3Da316524400685adb000000025c066bf5; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; SEARCH_ID=3647aeda7e1a49e3b9b031e28e40da85; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543930286; LGRID=20181204213037-c926f9d3-f7c8-11e8-8cbc-5254005c3644"
    }
    # Proxy to route the request through (the address came from a public
    # proxy list).
    # NOTE(review): only the "http" scheme is mapped, and requests picks a
    # proxy by URL scheme, so https:// detail URLs bypass this proxy.
    # Confirm whether an "https" entry is wanted.
    proxies = {
        "http": "http://116.62.9.96:80",
    }
    # A timeout keeps one unresponsive request from hanging the crawl.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    text = response.content.decode("utf-8")
    html = etree.HTML(text)

    name_nodes = html.xpath("//span[@class='name']/text()")
    job_request_spans = html.xpath('//dd[@class="job_request"]//span')
    # Without the title and the four job-request spans there is nothing to
    # extract (for instance when a page other than a job posting was
    # served), so report "no data" instead of failing with IndexError.
    if not name_nodes or len(job_request_spans) < 4:
        return None

    position_name = name_nodes[0]
    salary = _first_text(job_request_spans[0]).strip()
    city = _first_text(job_request_spans[1]).strip().replace("/", "").strip()
    # Drop the "/" separator and the whitespace around it, exactly as is
    # done for education below (previously the whitespace was left behind).
    work_year = _first_text(job_request_spans[2]).strip("/").strip()
    education = _first_text(job_request_spans[3]).strip("/").strip()
    # Concatenate every text fragment of the description into one string.
    desc = "".join(html.xpath('//dd[@class="job_bt"]//text()')).strip()

    # Hand the parsed values back; previously they were computed and then
    # discarded when the function returned.
    return {
        "position_name": position_name,
        "salary": salary,
        "city": city,
        "work_year": work_year,
        "education": education,
        "desc": desc,
    }
def main():
    """Entry point: run the job-list crawl via ``request_list_page``."""
    request_list_page()
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# Source article title: "ajax - crawling Lagou (lagou.com)"
# Blog footer: latest recommended article published 2024-09-01 19:13:21