数据分析师招聘岗位分析
1.本文的目的和内容
1.1.本文的目的:
通过分析能够了解公司对于数据分析岗位的要求及待遇
1.2.本文的内容:
主要针对以下几个问题:
1.数据分析岗位不同城市的需求分布;
2.数据分析岗位不同工作经验的需求分布;
3.数据分析岗位薪资整体情况;
4.不同城市的薪资分布;
5.不同工作经验的薪资分布;
6.数据分析岗位对于学历的要求;
7.不同工作经验对于学历的要求;
8.数据分析岗位对于工作技能的要求;
9.不同工作技能对于薪资的影响
2.数据获取
本项目所使用的数据集全部来自拉勾网,主要是因为拉勾网上的岗位信息非常完整、整洁。
本次爬取信息的时候,主要获得了以下信息:
[‘companyName’(公司),‘positionName’(职位),
‘city’(城市),‘salary’(工资),‘education’(学历),‘workYear’(工作经验),‘describition’(职位具体描述)]
#爬取拉钩招聘网站
import json
import time
import requests
import pandas as pd
from pyquery import PyQuery as pq
'''主要思路:发送第一个请求,得到cookie,通过传入对应的data 和 cookie,发送第二请求得到的json中获取对应的职位,要得到每个职位对应的具体描述,还需在json 中 得到showId,positonId,发送第三个请求得到对应的职位具体描述'''
#定义一个类,用于爬取拉钩数据
class lagouRequestContent():
def __init__(self):
self.url_first = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=%E6%95%B0%E6%8D%AE'
self.url_second = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
self.url_third = 'https://www.lagou.com/jobs/{}.html?show='
self.headers_first = {
#第一个header
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'Connection':'close',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
}
self.headers_second = {
#第二个header
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '55',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Host': 'www.lagou.com',
'Origin': 'https://www.lagou.com',
'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=sug&fromSearch=true&suginput=%E6%95%B0%E6%8D%AE',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'X-Anit-Forge-Code': '0',
# 'Connection':'close',
'X-Anit-Forge-Token': 'None',
'X-Requested-With': 'XMLHttpRequest'
}
self.headers_third = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
# 'Connection':'close',
'Host': 'www.lagou.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
#'北京','上海','深圳','广州','杭州','成都','重庆','南京','武汉','西安','佛山','东莞','昆明','珠海','无锡','厦门','长沙',
# '天津','福州','济南','大连','郑州','青岛','合肥','宁波','贵阳','长春','太原','石家庄','南昌'
self.province_city = [
'全国'
]
#第一次请求返回cookie和页数
def get_lagou_content_first(self,province_city):
#第一个请求的参数
params_first = {
'px': 'default',
'city': province_city
}
#请求原网页,以便获取cookies
response_first = requests.get(url = self.url_first,headers = self.headers_first,params = params_first)# 请求原网页
if response_first.status_code == 200:
doc = pq(response_first.content.decode('utf8'))
#得到页数
content_data = doc.find('.totalNum').text()
#得到cookie
cookie = response_first.cookies
#获取每一页的数据
for num in range(1,int(content_data)+1):
self.get_lagou_content(num,province_city,cookie)
time.sleep(10)
print('---'*10+str(num))
else:
return None
#发送请求
def get_lagou_content(self,nums,province_city,cookie):
#设置data--->form表单
if nums ==1:
data = {
'first':'true',
'pn':nums,
'kd':'数据分析'
}
else:
data = {
'first':'false',
'pn':nums,
'kd':'数据分析'
}
#设置参数
params = {
'px': 'default',
'city': province_city,
'needAddtionalResult': 'false'
}
response_second = requests.post(url = self.url_second,headers = self.headers_second,data =data,cookies = cookie,params=params )
cookie_second = response_second.cookies
if response_second.status_code == 200:
# print(response_second.status_code)
return self.parse_lagou_content(response_second,nums,cookie_second)
else:
return None
#解析请求内容
def parse_lagou_content(self,data,nums,cookie):
#将str类型的json 格式的,————>python对象
# print(type(data.content.decode('utf8')))
content_loads = json.loads(data.content.decode("utf8"))
# print(type(content_loads))#<class 'dict'>
# print(type(content_loads['content']['positionResult']['result']))#<class 'list'>
# print(type(content_loads['content']['positionResult']['result'][0]))#<class 'dict'>
# print(content_loads['content']['positionResult']['result'][0]['companyFullName'])#<class 'str'>
#设置参数
params_third = {
'show': content_loads['content']['showId']
}
return self.save_lagou_content(content_loads['content']['positionResult']['result'],nums,cookie,params_third)
#保存需要的数据
def save_lagou_content(self,value_data,nums,cookie,params_third):
lagou_data = []#里面存储形式为 [ [],[],[] ]
#遍历
for i in range(len(value_data)):
lagou_list = []
# print(value_data[i]['companyFullName'])
lagou_list.append(value_data[i]['companyFullName'])#公司名称
lagou_list.append(value_data[i]['positionName'])#职位
lagou_list.append(value_data[i]['city'])#城市
lagou_list.append(value_data[i]['salary'])#薪资
lagou_list.append(value_data[i]['companySize'])#公司大小
lagou_list.append('/'.join(value_data[i]['skillLables']))#技能要求
lagou_list.append(value_data[i]['education'])#教育
lagou_list.append(value_data[i]['workYear'])#工作经历
lagou_list.append(value_data[i]['financeStage'])#几轮融资
lagou_list.append(value_data[i]['createTime'])#创建时间
lagou_list.append(value_data[i]['resumeProcessRate'])#处理率
lagou_list.append(value_data[i]['resumeProcessDay'])#天处理率
lagou_list.append(value_data[i]['firstType'])
lagou_list.append(value_data[i]['secondType'])
lagou_list.append(value_data[i]['thirdType'])
lagou_list.append(value_data[i]['hitags'])
lagou_list.append('/'.join(value_data[i]['companyLabelList']))
lagou_list.append('/'.join(value_data[i]['positionLables']))
position_data = self.get_lagou_detail_content(cookie,params_third,value_data[i]['positionId']