import re
import requests
from lxml import etree
# HTTP headers sent with every request made by this script: a Referer, a
# desktop Chrome User-Agent and a Cookie header copied as literal strings.
# NOTE(review): the Cookie embeds session identifiers (e.g. JSESSIONID,
# SEARCH_ID); presumably they expire and should be refreshed or loaded from
# configuration instead of living in source -- confirm.
headers = {
    "Referer": 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=true&fromSearch=true&labelWords=&suginput=',
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    # The cookie is one logical value. It is written as two adjacent string
    # literals (concatenated by the parser) because a single-quoted literal
    # cannot span a physical line break; no newline ends up in the header.
    "Cookie": (
        '_ga=GA1.2.1419718633.1534239933; user_trace_token=20180814174533-c99d914f-9fa6-11e8-bbd0-525400f775ce; LGUID=20180814174533-c99d956c-9fa6-11e8-bbd0-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; WEBTJ-ID=20180822150634-1656074d52d7dc-03f6c87d5877c8-9393265-1049088-1656074d52e14e; _gid=GA1.2.528259101.1534921594; PRE_HOST=www.baidu.com; LGSID=20180822150647-ef43be6e-a5d9-11e8-9d2e-525400f775ce; PRE_UTM=m_cf_cpc_baidu_pc; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fbaidu.php%3Fsc.0s0000KlmKuCTsdU2p1IY8MijcJLOkObhE2GJJ3Bx3RAlBtKV5WaeZeVdx8AukzavfM6CcZcRarNSRDA6NTAbqfmwWuizpRVTbXv3d_x1mjBdhOqDW9263H0euRCsjCq85sBrbdiABgBOWn2nIosVy_0HET2LpUrYKn7UWCXTHijk8465s.7b_NR2Ar5Od663rj6tJQrGvKD7ZZKNfYYmcgpIQC8xxKfYt_U_DY2yP5Qjo4mTT5QX1BsT8rZoG4XL6mEukmryZZjzL4XNPIIhExzLu2SMcM-sSxH9vX8ZuEsSXej_qT5o43x5ksSEzseldPHV2XgZJyAp7WWgklX-f.U1Yk0ZDqs2v4VnL30ZKGm1Yk0Zfqs2v4VnL30A-V5HcsP0KM5yF-TZns0ZNG5yF9pywd0ZKGujYk0APGujY1rjb0UgfqnH0kPdtknjD4g1DsnWPxn1msnfKopHYs0ZFY5HD40ANGujYkPjfYg1cknj61g1cvn1Rsg1cznjTY0AFG5HcsP0KVm1YLPWnznj6Yn1KxnH0snNtkg100TgKGujYs0Z7Wpyfqn0KzuLw9u1Ys0A7B5HKxn0K-ThTqn0KsTjYknHfzPjRzPHT30A4vTjYsQW0snj0snj0s0AdYTjYs0AwbUL0qn0KzpWYk0Aw-IWdsmsKhIjYs0ZKC5H00ULnqn0KBI1Ykn0K8IjYs0ZPl5fKYIgnqn1n3PHRLnjcdrHfvrj0LnjTzn0Kzug7Y5HDdn1f4nWDdrH0kPWb0Tv-b5yfkrj04uWI-nj0sPvNbmWf0mLPV5HckfWRkPjR4nbf3wWckP1f0mynqnfKsUWYs0Z7VIjYs0Z7VT1Ys0ZGY5H00UyPxuMFEUHYsg1Kxn7ts0Aw9UMNBuNqsUA78pyw15HKxn7tsg100TA7Ygvu_myTqn0Kbmv-b5H00ugwGujYVnfK9TLKWm1Ys0ZNspy4Wm1Ys0Z7VuWYs0AuWIgfqn0KhXh6qn0Khmgfqn0KlTAkdT1Ys0A7buhk9u1Yk0Akhm1Ys0APzm1Yzn1m4rf%26ck%3D5940.3.165.193.271.214.333.159%26shh%3Dwww.baidu.com%26sht%3Dbaiduhome_pg%26us%3D1.0.2.0.4.1860.0%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26issp%3D1%26f%3D8%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26inputT%3D3542%26bc%3D110101; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpc_baidu_pc%26m_kw%3Dbaidu_cpc_bj_e110f9_d2162e_%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591; '
        'JSESSIONID=ABAAABAAAGGABCB4BD507DDC4105BC80C5B7BB4A3D2ECA4; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534316823,1534921594,1534921605,1534921616; TG-TRACK-CODE=search_code; _gat=1; LGRID=20180822150820-26dbd097-a5da-11e8-abfc-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534921698; SEARCH_ID=1d3f22a9d83a4dbbb6f8042a55b085a6'
    ),
}
def request_list_page(*, max_pages=1, max_positions_per_page=1, timeout=10):
    """Fetch job-search result pages and parse the positions they list.

    Posts the search form (keyword ``python``) to the position search
    endpoint once per page, reads the position list from
    ``content -> positionResult -> result`` in the JSON reply, builds each
    position's detail-page URL from its ``positionId`` and passes it to
    ``parse_position_detail``.

    The previous implementation unconditionally broke out of both loops
    after their first iteration, so only the first position of the first
    page was ever processed. The defaults keep that behaviour; callers can
    widen the crawl through the keyword arguments.

    Args:
        max_pages: Number of result pages to request, starting at page 1.
            Capped at 13, the last page the original loop covered. ``None``
            requests all 13 pages.
        max_positions_per_page: How many positions of each page to parse.
            ``None`` parses every position on the page.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.post`` so a stalled connection cannot hang forever.

    Raises:
        requests.RequestException: On connection failure, timeout or an
            HTTP error status.
        KeyError: If the JSON reply lacks the expected keys.
    """
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    last_page = 13
    page_count = last_page if max_pages is None else min(max_pages, last_page)

    for page in range(1, page_count + 1):
        form = {
            'first': 'false',
            'pn': page,
            'kd': 'python',
        }
        response = requests.post(url, headers=headers, data=form, timeout=timeout)
        response.raise_for_status()
        positions = response.json()['content']['positionResult']['result']

        # A slice end of None keeps the whole list.
        for position in positions[:max_positions_per_page]:
            # Build the detail-page URL for this position.
            position_url = 'https://www.lagou.com/jobs/%s.html' % position['positionId']
            parse_position_detail(position_url)
def _job_request_text(span, strip_separators=True):
    """Return the first text node of *span* with surrounding whitespace removed.

    When *strip_separators* is true, every remaining whitespace character
    and '/' is removed as well.
    """
    text = span.xpath("./text()")[0].strip()
    if strip_separators:
        # Drop the characters that carry no information.
        text = re.sub(r"[\s/]", "", text)
    return text


def parse_position_detail(url, *, timeout=10):
    """Download one position detail page and print the fields found on it.

    Prints, in order: the position name, salary, city, work-year
    requirement, education requirement and the description text. The name
    comes from ``span.name``, the next four from the first four ``<span>``
    elements under ``dd.job_request``, and the description is every text
    node under ``dd.job_bt`` joined together.

    Args:
        url: Address of the position detail page.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get`` so a stalled connection cannot hang forever.

    Raises:
        requests.RequestException: On connection failure, timeout or an
            HTTP error status.
        IndexError: If an expected element is missing from the page.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    html = etree.HTML(response.text)

    position_name = html.xpath("//span[@class='name']/text()")[0]
    print(position_name)

    job_request = html.xpath("//dd[@class='job_request']//span")

    # The salary is only trimmed; the other fields also lose '/' separators.
    salary = _job_request_text(job_request[0], strip_separators=False)
    print(salary)

    city = _job_request_text(job_request[1])
    print(city)

    work_year = _job_request_text(job_request[2])
    print(work_year)

    education = _job_request_text(job_request[3])
    print(education)

    # Collapse the list of text nodes into a single string.
    desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
    print(desc)
def main():
    """Script entry point: run the list-page crawl."""
    request_list_page()


# Only start the crawl when the file is executed directly, not on import.
if __name__ == "__main__":
    main()