首先,写一个爬取岗位的爬虫,如下:
# -*- coding:utf-8 -*-
from json import JSONDecodeError
import requests
import time
import pandas as pd
# Per-field accumulators for the scraped postings: get_json() appends one
# value per job to each list (company name, perks, location, education,
# job type, post time, title, salary, required experience, ...), and
# main() later bundles them into info_dict to build a DataFrame.
(companyFullName, job_city, companySize, positionId, companyId,
 positionName, secondType, positionLables, industryField, industryLables,
 salary, positionAdvantage, workYear, stationname, education, createTime,
 longitude, latitude) = ([] for _ in range(18))
# Column-name -> column-values mapping, filled at the end of the crawl.
info_dict = dict()
def get_json(url, datas):
    """Fetch one page of job postings and append each field to the
    module-level accumulator lists.

    Retries up to 3 times on a failed JSON parse or an anti-crawler
    "no data" response (sleeping 60s before retrying the latter).

    Parameters:
        url: the Lagou positionAjax endpoint.
        datas: POST form data (page number, keyword, city).

    Returns:
        True  -- a non-empty result list was parsed and stored.
        False -- the result list was empty (no more pages) or all
                 3 attempts failed.
    """
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    }
    cookies = {
        'Cookie': '你的cookie'
    }
    for x in range(3):
        content = requests.post(url=url, cookies=cookies, headers=my_headers, data=datas)
        try:
            result = content.json()
        except JSONDecodeError:
            print('=====================解析失败==============================\n', content)
            # BUG FIX: the original fell through after a parse failure and
            # then read `result`, raising NameError on the first attempt
            # (or silently reusing a stale result on later ones). Retry.
            continue
        if 'content' not in result:
            # Anti-crawler response without a payload; back off and retry.
            print("=====================没有数据==============================")
            time.sleep(60)
            continue
        info = result['content']['positionResult']['result']
        print(info)
        if len(info) < 1:
            # Empty page: signal the caller to stop paging this city.
            return False
        for job in info:
            job_city.append(job['city'])
            companyId.append(job['companyId'])
            companyFullName.append(job['companyFullName'])
            companySize.append(job['companySize'])
            positionId.append(job['positionId'])
            positionName.append(job['positionName'])
            secondType.append(job['secondType'])
            positionLables.append(job['positionLables'])
            industryField.append(job['industryField'])
            industryLables.append(job['industryLables'])
            salary.append(job['salary'])
            positionAdvantage.append(job['positionAdvantage'])
            workYear.append(job['workYear'])
            stationname.append(job['stationname'])
            education.append(job['education'])
            createTime.append(job['createTime'])
            longitude.append(job['longitude'])
            latitude.append(job['latitude'])
        return True
    # All retries exhausted. The original implicitly returned None here,
    # which the caller treats the same (falsy) way; an explicit False is
    # consistent with the documented contract.
    return False
def main():
    """Crawl up to 29 pages of '数据分析' postings for each city in the
    module-level `citys` list, then write all accumulated fields to
    LGTotal.csv via a pandas DataFrame.

    Sleeps 20s between pages and 10s between cities to avoid the site's
    rate limiting. Stops paging a city as soon as get_json() reports an
    empty result page.
    """
    global citys
    for city in citys:
        for page in range(1, 30):
            url = 'https://www.lagou.com/jobs/positionAjax.json?&needAddtionalResult=false'
            datas = {
                'first': True,
                'pn': page,
                'kd': '数据分析',
                'city': city
            }
            if not get_json(url, datas):
                break
            time.sleep(20)
        time.sleep(10)
    info_dict['city'] = job_city
    info_dict['companyId'] = companyId
    info_dict['companyFullName'] = companyFullName
    info_dict['companySize'] = companySize
    info_dict['positionId'] = positionId
    info_dict['positionName'] = positionName
    info_dict['secondType'] = secondType
    info_dict['positionLables'] = positionLables
    info_dict['industryField'] = industryField
    info_dict['industryLables'] = industryLables
    info_dict['salary'] = salary
    info_dict['positionAdvantage'] = positionAdvantage
    info_dict['workYear'] = workYear
    info_dict['stationname'] = stationname
    info_dict['education'] = education
    # BUG FIX: createTime is collected by get_json() but was never added
    # to info_dict, so the post-time column silently vanished from the CSV.
    info_dict['createTime'] = createTime
    info_dict['longitude'] = longitude
    info_dict['latitude'] = latitude
    frame = pd.DataFrame(info_dict)
    frame.to_csv("LGTotal.csv")
if __name__ == '__main__':
    # BUG FIX: '南京' appeared twice in the original list, so Nanjing was
    # crawled twice and its rows duplicated in the output CSV.
    citys = ['北京', '上海', '广州', '深圳', '杭州', '厦门', '成都',
             '南京', '武汉', '西安', '长沙', '天津', '苏州']
    main()
打开保存的csv文件,部分数据如下:
通过清洗一些空数据和拆分薪资上下限等,进一步进行分析,并且可视化,可视化的工具为 Power BI。可视化结果如下