记录一种爬取拉勾网的方式作学习交流
一般的网站在爬取的时候,使用requests库发出请求,headers里面包括referer和user-agent两项就可以得到正确数据,而拉勾网的反爬机制却可以识别这种简单的爬虫。所以我在未登录的情况下,首先向拉勾网首页发起get请求,将对话信息保存在session,然后用这个session去请求拉勾网传输职位信息的真正的url,就可以成功爬取到职位数据(以python岗位为例)了。
但是这种方式存在很大的弊端,如果网站的反爬机制更新,将会花费大量的时间修改代码,所以推荐使用下一篇博客的方式
链接: https://blog.csdn.net/qq_41801603/article/details/105648107
附代码:
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import time
import re
# Headers for the POST requests to the positionAjax JSON endpoint.
# The Referer must point at the job-list search page, otherwise Lagou's
# anti-crawler check rejects the request.
headers2 = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
}
# Obtain a session primed with Lagou's anti-crawler cookies.
def getsession():
    """Visit the Lagou job-list page once and return the resulting session.

    Lagou sets cookies on the HTML search page that must accompany any
    request to the JSON position endpoint; the returned
    ``requests.Session`` carries those cookies.

    Returns:
        requests.Session: a session that has performed the priming GET.
    """
    headers1 = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
        'Referer': 'https://www.lagou.com/'
    }
    url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    # requests.Session() is the documented class; requests.session() is a
    # deprecated lowercase alias.
    session = requests.Session()
    # Without a timeout a stalled connection would hang the crawler forever.
    session.get(url, headers=headers1, timeout=10)
    return session
# Crawl the job-list JSON endpoint and visit every position it returns.
def request_list_page(keyword='python', city='北京', first_page=1, last_page=9):
    """Fetch Lagou search-result pages and crawl each position's detail page.

    Generalized from the original hard-coded crawl: defaults reproduce the
    original behavior (keyword ``python``, Beijing, pages 1-9).

    Args:
        keyword: search keyword posted as ``kd``.
        city: city name; requests URL-encodes it into the query string.
        first_page: first result page to fetch (inclusive).
        last_page: last result page to fetch (inclusive).
    """
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    # Let requests percent-encode the city instead of hard-coding
    # ``city=%E5%8C%97%E4%BA%AC`` into the URL.
    params = {'city': city, 'needAddtionalResult': 'false'}
    session = getsession()
    for page in range(first_page, last_page + 1):
        data = {
            'first': 'false',
            'pn': page,
            'kd': keyword,
        }
        response = session.post(url, headers=headers2, params=params, data=data)
        # Throttle: posting too quickly gets the session flagged as a bot.
        time.sleep(3)
        result = response.json()
        infos = result['content']['positionResult']['result']
        for info in infos:
            positionid = info['positionId']
            positionurl = 'https://www.lagou.com/jobs/%s.html' % positionid
            # Scrape the detail page for this position.
            parse_detail_url(positionurl)
# Parse one job-detail page and print the extracted fields.
def parse_detail_url(url):
    """Fetch a Lagou job-detail page, extract its main fields and print them.

    Args:
        url: absolute URL of a ``https://www.lagou.com/jobs/<id>.html`` page.

    Returns:
        dict: the extracted record (name, salary, city, work_years,
        education, description) — returned in addition to being printed so
        callers can collect results. (Original returned ``None``; no caller
        used the return value, so this is backward compatible.)
    """
    session = getsession()
    response = session.get(url, headers=headers2)
    htmlelement = etree.HTML(response.text)
    # Lagou renders "/" separators and stray whitespace between the spans;
    # compile the cleanup pattern once instead of repeating re.sub.
    separators = re.compile(r"[\s/]")
    name = htmlelement.xpath('//h1[@class = "name"]/text()')[0]
    job_request = htmlelement.xpath('//dd[@class="job_request"]//span')
    salary = job_request[0].xpath('.//text()')[0].strip()
    city = separators.sub("", job_request[1].xpath('./text()')[0].strip())
    work_years = separators.sub("", job_request[2].xpath('.//text()')[0].strip())
    education = separators.sub("", job_request[3].xpath('.//text()')[0].strip())
    description = "".join(htmlelement.xpath("//dd[@class='job_bt']//text()"))
    # Named ``position`` instead of ``dict`` — never shadow the builtin.
    position = {
        'name': name,
        'salary': salary,
        'city': city,
        'work_years': work_years,
        'education': education,
        # Fixes the original 'desciption' key typo.
        'description': description,
    }
    print(position)
    return position
def main():
    """Script entry point: run the default Lagou job-list crawl."""
    request_list_page()


if __name__ == '__main__':
    main()