Scraping "data analysis" job postings from Lagou's popular cities and visualizing the results

First, write a crawler to collect the postings, as follows:

# -*- coding:utf-8 -*-
from json import JSONDecodeError
import requests
import time
import pandas as pd


# Fetch the JSON object that holds the job postings, then iterate over it to collect
# company name, benefits, work location, education requirement, job type, publish time,
# position name, salary, and years of experience

companyFullName = []
job_city = []
companySize = []
positionId = []
companyId = []
positionName = []
secondType = []
positionLables = []
industryField = []
industryLables = []
salary = []
positionAdvantage = []
workYear = []
stationname = []
education = []
createTime = []
longitude = []
latitude = []
info_dict = dict()


def get_json(url, datas):
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    }
    cookies = {
        'Cookie': 'your cookie here'
    }
    for x in range(3):
        content = requests.post(url=url, cookies=cookies, headers=my_headers, data=datas)
        # content.encoding = 'utf-8'
        try:
            result = content.json()
        except JSONDecodeError:
            # Parsing failed (usually the anti-crawler page came back instead of JSON); wait and retry
            print('=====================parse failed==============================\n', content.text)
            time.sleep(60)
            continue

        if 'content' not in result:
            print("=====================no data==============================")
            time.sleep(60)
            continue

        info = result['content']['positionResult']['result']
        print(info)
        if len(info) < 1:
            return False

        for job in info:
            job_city.append(job['city'])
            # print(job['city'])
            companyId.append(job['companyId'])
            companyFullName.append(job['companyFullName'])
            companySize.append(job['companySize'])
            positionId.append(job['positionId'])
            positionName.append(job['positionName'])
            secondType.append(job['secondType'])
            positionLables.append(job['positionLables'])
            industryField.append(job['industryField'])
            industryLables.append(job['industryLables'])
            salary.append(job['salary'])
            positionAdvantage.append(job['positionAdvantage'])
            workYear.append(job['workYear'])
            stationname.append(job['stationname'])
            education.append(job['education'])
            createTime.append(job['createTime'])
            longitude.append(job['longitude'])
            latitude.append(job['latitude'])
        # break
        return True
    # All retries exhausted without usable data: tell the caller to stop paging this city
    return False


def main():
    global citys
    for city in citys:
        for x in range(1, 30):
            url = 'https://www.lagou.com/jobs/positionAjax.json?&needAddtionalResult=false'
            datas = {
                'first': True,
                'pn': x,
                'kd': '数据分析',
                'city': city
            }
            isContinue = get_json(url, datas)
            if not isContinue:
                break
            time.sleep(20)
        time.sleep(10)
    info_dict['city'] = job_city
    info_dict['companyId'] = companyId
    info_dict['companyFullName'] = companyFullName
    info_dict['companySize'] = companySize
    info_dict['positionId'] = positionId
    info_dict['positionName'] = positionName
    info_dict['secondType'] = secondType
    info_dict['positionLables'] = positionLables
    info_dict['industryField'] = industryField
    info_dict['industryLables'] = industryLables
    info_dict['salary'] = salary
    info_dict['positionAdvantage'] = positionAdvantage
    info_dict['workYear'] = workYear
    info_dict['stationname'] = stationname
    info_dict['education'] = education
    info_dict['createTime'] = createTime
    info_dict['longitude'] = longitude
    info_dict['latitude'] = latitude

    frame = pd.DataFrame(info_dict)
    frame.to_csv("LGTotal.csv", encoding='utf_8_sig')  # utf_8_sig keeps the Chinese fields readable when opened in Excel


if __name__ == '__main__':
    citys = ['北京', '上海', '广州', '深圳', '杭州', '厦门', '成都', '南京', '武汉', '西安', '长沙', '天津', '苏州']
    main()

Open the saved CSV file; part of the data is shown below:
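To take a quick look at the exported file before any cleaning, here is a minimal pandas sketch (assuming the script above has already written LGTotal.csv to the working directory):

import pandas as pd

# The first unnamed column in LGTotal.csv is the index written by to_csv
df = pd.read_csv("LGTotal.csv", index_col=0)
print(df.shape)    # how many postings and columns were collected
print(df.head())   # preview the first few rows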


By cleaning out some empty records and splitting the salary field into its lower and upper bounds (a pandas sketch of this step is given below), the data can be analyzed further and visualized; the visualization tool is Power BI. The visualization results are shown below.
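The cleaning and salary split can be done in pandas before the data goes into Power BI. This is only a minimal sketch, assuming the salary strings follow Lagou's usual "10k-20k" pattern; the derived column names salary_low, salary_high, salary_avg and the output file name LGTotal_clean.csv are my own choices, not part of the original script:

import pandas as pd

df = pd.read_csv("LGTotal.csv", index_col=0)

# Drop rows where key fields are empty
df = df.dropna(subset=["salary", "city", "education"])

# Split "10k-20k" style strings into numeric lower/upper bounds (unit: k RMB per month);
# strings that do not match the pattern end up as NaN via errors="coerce"
bounds = df["salary"].str.lower().str.replace("k", "", regex=False).str.split("-", expand=True)
df["salary_low"] = pd.to_numeric(bounds[0], errors="coerce")
df["salary_high"] = pd.to_numeric(bounds[1], errors="coerce")
df["salary_avg"] = (df["salary_low"] + df["salary_high"]) / 2

# Save the cleaned table for Power BI; utf_8_sig keeps the Chinese text readable in Excel
df.to_csv("LGTotal_clean.csv", encoding="utf_8_sig")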

