import requests
import time
from lxml import etree
import re
# HTTP headers sent with every request made by this module.
# NOTE(review): genuine Chrome User-Agent strings end in "Safari/537.36"; the
# trailing "0" below looks like a typo -- confirm before changing it.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.360'
    ),
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
}
def get_url(max_pages=1, max_positions=1):
    """Query the Lagou "python" job search and parse the resulting job pages.

    For every result page a fresh session first loads the HTML search page so
    that the session picks up cookies, then posts the search form to the Ajax
    endpoint. The detail URL of each returned position is passed to
    ``parse_url``.

    Args:
        max_pages: Number of result pages to request, starting at page 1.
            The default of 1 keeps the previous behaviour, where the loop was
            cut short after the first page.
        max_positions: Maximum number of positions handed to ``parse_url``
            per page (``None`` for all of them). The default of 1 keeps the
            previous behaviour of stopping after the first position.

    Raises:
        requests.RequestException: On a network failure, a timeout or an
            HTTP error status from the Ajax endpoint.
        KeyError: If the JSON reply lacks ``content.positionResult.result``.
    """
    url_start = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    for page in range(1, max_pages + 1):
        form = {'first': 'true', 'pn': str(page), 'kd': 'python'}

        # A new session per page: the search page is requested first so the
        # Ajax call below is sent with cookies obtained for this request.
        with requests.Session() as session:
            session.get(url_start, headers=headers, timeout=3)
            # The session sends its own cookie jar automatically, so no
            # explicit ``cookies=`` argument is needed.
            response = session.post(url_parse, data=form, headers=headers, timeout=3)
        response.raise_for_status()

        # Pause between the search request and the detail-page requests.
        time.sleep(3)

        positions = response.json()['content']['positionResult']['result']
        for position in positions[:max_positions]:
            position_id = position['positionId']
            url = 'https://www.lagou.com/jobs/%d.html?show=d9ba2bf5a5c14b838db35a714a40f123' % position_id
            parse_url(url)
def parse_url(url):
    """Download one job-detail page, print its description and return its fields.

    Args:
        url: Absolute URL of the job-detail page to fetch.

    Returns:
        A dict with the keys ``position_name``, ``salary``, ``city``,
        ``experience``, ``education`` and ``desc``, all strings extracted
        from the page.

    Raises:
        requests.RequestException: On a network failure, a timeout or an
            HTTP error status.
        IndexError: If the page does not contain the expected job elements.
    """
    # Same timeout as the search requests so a stalled server cannot hang us.
    response = requests.get(url, headers=headers, timeout=3)
    response.raise_for_status()
    html = etree.HTML(response.text)

    names = html.xpath("//h1[@class='name']/text()")
    spans = html.xpath("//dd[@class='job_request']//span")
    if not names or len(spans) < 4:
        # Fail with a message that names the page instead of a bare
        # "list index out of range".
        raise IndexError('unexpected page layout, job fields not found: %s' % url)

    salary, city, experience, education = (
        span.xpath('.//text()')[0].strip() for span in spans[:4]
    )
    # These three fields carry separators (whitespace and "/") that are removed.
    city = re.sub(r"[\s/]", '', city)
    experience = re.sub(r"[\s/]", '', experience)
    education = re.sub(r"[\s/]", '', education)

    desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
    print(desc)

    return {
        'position_name': names[0],
        'salary': salary,
        'city': city,
        'experience': experience,
        'education': education,
        'desc': desc,
    }
def main():
    """Script entry point: run the crawl with ``get_url``'s default limits."""
    get_url()


if __name__ == '__main__':
    main()