综合案例BOSS直聘数据爬虫、数据处理、数据可视化(直接上代码)

先看结果

在这里插入图片描述

代码

在运行之前需要做几个准备:
1、在 BOSS 直聘网站抓包获取城市列表数据,保存为 china.json,并与爬虫脚本放在同一目录
2、下载并安装与本机 Chrome 版本匹配的 ChromeDriver
3、如果 IP 被封,请自行更换高匿代理 IP(修改代码中的 proxy)
遇到问题可自行搜索解决

爬虫部分

#!/usr/bin/env python3
from bs4 import BeautifulSoup
from selenium import webdriver
import json
import os
import urllib.parse as parse

# Fetch the rendered HTML of a page through a (optionally proxied) Chrome session.
def getResponseContent(url, proxy="223.244.166.186:3828"):
    """Load *url* in Chrome via Selenium and return the rendered page source.

    Args:
        url: Address of the page to load.
        proxy: ``host:port`` of an HTTP proxy to route the browser through.
            Replace the default with a working high-anonymity proxy, or pass
            ``None``/``""`` to connect directly.

    Returns:
        The page's HTML as reported by ``driver.page_source``.
    """
    chrome_binary = r'C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe'

    chromeOptions = webdriver.ChromeOptions()
    if proxy:
        chromeOptions.add_argument('--proxy-server=http://' + proxy)
    # Only pin the browser binary when that path actually exists on this
    # machine; otherwise let Selenium locate Chrome on its own.
    if os.path.isfile(chrome_binary):
        chromeOptions.binary_location = chrome_binary

    # The options must be handed to the driver, otherwise the proxy and the
    # binary location are silently ignored.
    driver = webdriver.Chrome(options=chromeOptions)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always close the browser, even when loading the page fails.
        driver.quit()

# Look up the BOSS city code for a city name.
def getCityid(CITY):
    """Return the city code (as a string) for the city named *CITY*.

    Reads ``china.json`` from the directory containing this script, flattens
    its ``zpData -> cityList -> subLevelModelList`` entries into a
    ``{name: code}`` mapping, and writes that mapping to ``city.json`` in the
    same directory as a side effect.

    Args:
        CITY: City name exactly as it appears in ``china.json``.

    Returns:
        The city's code converted to ``str``.

    Raises:
        KeyError: If *CITY* is not present in the mapping.
    """
    base_dir = os.path.dirname(__file__)

    with open(os.path.join(base_dir, './china.json'), 'r', encoding='utf-8') as f:
        china_list = json.load(f)['zpData']

    city_dict = {}
    for province in china_list['cityList']:
        # Tolerate a province whose sub-list is null instead of crashing.
        for city in province['subLevelModelList'] or ():
            city_dict[city['name']] = str(city['code'])

    # Serialise with the json module so the file is always valid JSON, even
    # when a name contains quote characters.
    with open(os.path.join(base_dir, 'city.json'), 'w', encoding='utf-8') as f:
        json.dump(city_dict, f, ensure_ascii=False)

    return city_dict[CITY]

# Build the paginated search URLs from the position and city typed by the user.
def getUrls(urlHead, urlEnd, max_pages=14):
    """Prompt for a position and a city and return the result-page URLs.

    Args:
        urlHead: URL prefix that precedes the city code.
        urlEnd: URL fragment placed between the city code and the
            URL-encoded position name.
        max_pages: Number of result pages to generate. Defaults to 14, the
            limit the original author notes for users who are not logged in.

    Returns:
        A list of URLs, one per page, for pages ``1`` through ``max_pages``.

    Raises:
        KeyError: Propagated from ``getCityid`` when the city is unknown.
    """
    name = input('请输入需要查询的职位:')
    city = input('请输入需要查询的城市:')
    name = parse.quote(name)
    cityid = getCityid(city)
    htmlurl = urlHead + cityid + urlEnd + name

    url_ls = []
    # range() excludes its upper bound, so add one to include the last page.
    for i in range(1, max_pages + 1):
        url = htmlurl + '&page=' + str(i)
        url_ls.append(url)
        print('添加该页:%s 到URLS列表' % url)
    return url_ls

# Extract the fields of a single job card into a dict.
def _parse_job_card(tag):
    """Return a dict of the text fields read from one ``job-primary`` element."""
    item = {}
    item['position'] = tag.find('span', attrs={'class': 'job-name'}).get_text()
    tags1 = tag.find('div', attrs={'class': 'info-company'})
    item['name'] = tags1.find('h3', attrs={'class': 'name'}).get_text()
    item['industry'] = tags1.find('a', attrs={'class': 'false-link'}).get_text()
    # The third newline-separated chunk of the company text is stored as 'scale'.
    item['scale'] = tags1.find('div', attrs={'class': 'company-text'}).get_text().split('\n')[2]
    item['hr'] = tag.find('h3', attrs={'class': 'name'}).get_text()
    item['address'] = tag.find('span', attrs={'class': 'job-area'}).get_text()
    item['keyword'] = ','.join(tag.find('div', attrs={'class': 'tags'}).get_text().split('\n'))
    item['welfare'] = tag.find('div', attrs={'class': 'info-desc'}).get_text()
    item['experience'] = tag.find('div', attrs={'class': 'job-limit clearfix'}).get_text().split('\n')[2]
    item['wages'] = tag.find('span', attrs={'class': 'red'}).get_text()
    item['Link'] = 'https://www.zhipin.com/' + tag.find('div', attrs={'class': 'primary-box'}).get('href')
    return item


# Scrape every result page and collect one record per job card.
def get_info(urls):
    """Download each URL in *urls* and return all job records found.

    Args:
        urls: Iterable of result-page URLs.

    Returns:
        A list of dicts (one per ``div.job-primary`` element) accumulated
        across all pages, in page order.
    """
    # Created once, outside the loop, so records from earlier pages are kept.
    items = []
    for url in urls:
        print("**********************")
        htmlContent = getResponseContent(url)
        soup = BeautifulSoup(htmlContent, 'html.parser')
        for tag in soup.find_all('div', attrs={'class': 'job-primary'}):
            item = _parse_job_card(tag)
            items.append(item)
            print('获取公司名为:<<%s>>' % (item['name']))
    return items

def write_json(items):
    """Save *items* as indented, non-ASCII-escaped JSON in ``BOSS直聘.json``.

    The file is created in (or overwritten within) the current working
    directory.
    """
    with open('BOSS直聘.json', mode='w', encoding='utf-8') as out_file:
        serialised = json.dumps(items, ensure_ascii=False, indent=4)
        out_file.write(serialised)

if __name__ == '__main__':
    # Pipeline: build the search URLs, scrape every page, save the records.
    page_urls = getUrls('https://www.zhipin.com/c', '/?query=')
    write_json(get_info(page_urls))

数据处理和可视化部分

import pandas as pd

# Load the job postings that the crawler saved as JSON.
df = pd.read_json('BOSS直聘.json', encoding="utf-8")
#df.to_csv('BOSS直聘.csv',index=None,mode='a')

# Count how often each district label occurs in the 'address' column
# (original note: split the postings into Shanghai districts).
address = df['address'].str.split('·',1).str[1]  # keep the text after the first '·'; values without '·' become NaN
address = address.str.split().apply(pd.Series)  # split on whitespace and spread the tokens over columns
address = address.apply(lambda x:x.value_counts()).fillna('0').astype(int)  # per-column counts, gaps filled with 0
address = address.apply(lambda x:x.sum(),axis=1)  # total count per label across all columns
address = pd.DataFrame(address,columns=['counts'])   # convert to a DataFrame
address.reset_index(level=0, inplace=True)  # move the labels out of the index into a regular column
#print(address)


# Strip the leading industry text from each row's 'scale' field so only the
# remainder (company-size information, per the original note) is left.
scale = df.apply(lambda x: x['scale'].split(x['industry'],1),axis=1)  # split once at the row's own industry text
scale= scale.apply(lambda x: x[1:])  # discard the part before the industry text
scale= scale.apply(lambda x: ''.join(x))  # back to a plain string
#print(scale)

# Separate the company type from the head count.
# Company type
# NOTE(review): the split points are the literal digits '1' and '5', which
# presumably mark where head-count strings such as '100-499人' or '500-999人'
# begin; head counts starting with another digit would stay glued to the
# type text — confirm against the data.
scale2 = df['scale'].str.split('1',1).str[1]  # text after the first '1' of the raw field; NaN when there is no '1'
type = scale.str.split('1',1).str[0]  # text before the first '1'
type = type.str.split('5',1).str[0]  # ...and before the first '5'
type1 = type   # per-posting Series kept for the salary analysis
type = type.apply(pd.Series)
type = type.apply(lambda x:x.value_counts()).astype(int)  # occurrences of each type label
type = type.apply(lambda x:x.sum(),axis=1)
type = pd.DataFrame(type,columns=['counts'])
type.reset_index(level=0, inplace=True)  # labels become the 'index' column read by the pie chart
#print(type)

# Company head count
# `where` keeps a cell when the predicate is True and substitutes the given
# replacement otherwise. The three passes below repair values that lost their
# leading '1' when the field was split on '1' earlier.
# NOTE(review): other truncated values (e.g. '00-499人' from '100-499人') are
# not restored here — confirm whether that is intended.
scale2= scale2.apply(pd.Series)  # Series -> single-column DataFrame
# Missing values (no '1' in the field) are labelled '500-999人'.
scale2= scale2.where((scale2.applymap(
    lambda x: True if str(x) != 'nan' else False)), '500-999人')
# '000-9999人' -> '1000-9999人'
scale2= scale2.where((scale2.applymap(
    lambda x: True if str(x) != '000-9999人' else False)), '1000-9999人')
# '0000人以上' -> '10000人以上'
scale2= scale2.where((scale2.applymap(
    lambda x: True if str(x) != '0000人以上' else False)), '10000人以上')
scale3 = scale2  # per-posting frame kept for the salary analysis
scale2= scale2.apply(lambda x:x.value_counts()).astype(int)  # occurrences of each head-count label
scale2= scale2.apply(lambda x:x.sum(),axis=1)
scale2 = pd.DataFrame(scale2,columns=['counts'])
scale2.reset_index(level=0, inplace=True)  # labels become the 'index' column read by the pie chart
#print(scale2)


# Reshape the skill keywords into a frequency table of the 25 most common ones.
keyword = df['keyword'].str.split(',').apply(pd.Series)  # one keyword per column
keyword = keyword.apply(lambda x:x.value_counts()).fillna('0').astype(int)  # per-column counts, gaps filled with 0
keyword = keyword.apply(lambda x:x.sum(),axis=1)  # total mentions per keyword
keyword = pd.DataFrame(keyword,columns=['counts'])
keyword = keyword.drop([''])  # remove the empty-string label (raises if it is absent)
keyword = keyword.sort_values('counts',ascending=False)  # most frequent first
keyword = keyword.head(25)
keyword.reset_index(level=0, inplace=True)  # keywords become the 'index' column read by the bar chart
#print(keyword)


# Split the combined 'experience' field into education and experience.
# Education
df['education'] = df['experience'].str[-2:]  # the last two characters are taken as the education level
education = df['education'].apply(pd.Series)  # Series -> single-column DataFrame
education1 = education  # per-posting frame kept for the salary analysis
education = education.apply(lambda x:x.value_counts()).fillna('0').astype(int)  # occurrences of each level
education = education.apply(lambda x:x.sum(),axis=1)
education = pd.DataFrame(education,columns=['counts'])
education.reset_index(level=0, inplace=True)  # labels become the 'index' column read by the pie chart
#print(education)

# Work experience
experience = df.apply(lambda x: x['experience'].split(x['education'],1),axis=1)  # cut the education text out of each row
experience = experience.str[0:]  # full slice: keeps every piece of the split
experience = experience.apply(lambda x: ''.join(x)).apply(pd.Series)  # rejoin the pieces; single-column DataFrame
experience1 = experience  # per-posting frame kept for the salary analysis
experience = experience.apply(lambda x:x.value_counts()).fillna('0').astype(int)  # occurrences of each label
experience = experience.apply(lambda x:x.sum(),axis=1)
experience = pd.DataFrame(experience,columns=['counts'])
experience.reset_index(level=0, inplace=True)  # labels become the 'index' column read by the pie chart
# Discard labels that occur only once or twice (treated as noise by the author).
experience =experience[~experience['counts'].isin([1,2])]
#print(experience)


# Salary data: frequency table of the distinct salary strings.
wages = df['wages'].str.split('·',1).str[0]  # keep the text before the first '·'
wages = wages.apply(pd.Series)  # Series -> single-column DataFrame
wages = wages.apply(lambda x:x.value_counts()).fillna('0').astype(int)  # occurrences of each salary string
wages = wages.apply(lambda x:x.sum(),axis=1)
wages = pd.DataFrame(wages,columns=['counts'])
wages.reset_index(level=0, inplace=True)  # salary strings move from the index into a regular column
#print(wages)

# Extract the lower and upper salary bound of every posting.
# The bounds are parsed from the per-posting salary strings in `df` rather
# than from the de-duplicated `wages` count table: `wages1` must share `df`'s
# row index, because the per-posting columns attached to it (company type
# here; scale, education and experience later) are aligned on that index.
wage_text = df['wages'].str.split('·', n=1).str[0]  # drop any suffix after the first '·'
wage_bounds = wage_text.str.split('-', n=1)
wages1 = pd.DataFrame()
wages1['min'] = wage_bounds.str[0]
wages1['max'] = wage_bounds.str[1].str[:-1]  # strip the trailing unit character
wages1['type'] = type1
print(wages1)
# Drop postings whose salary is quoted per day: their lower bound is 100 or
# more. Lower bounds that are not plain numbers become NaN and are dropped
# as well instead of aborting the script.
aaa = pd.to_numeric(wages1['min'], errors='coerce')
wages1 = wages1[aaa < 100].copy()  # copy so later column assignments modify a real frame
wages1['min'] = aaa[aaa < 100].astype(int)
#print(wages1)

# Salary range by company type: smallest lower bound and largest upper bound
# per group, using the built-in min/max as aggregators.
min1 = wages1.groupby(by=['type']).agg({'min':min})
wages1['max'] = wages1['max'].apply(pd.Series).astype(int)  # upper bounds from text to int
max1 = wages1.groupby(by=['type']).agg({'max':max})
min1.reset_index(level=0, inplace=True)  # turn the group key back into a column
max1.reset_index(level=0, inplace=True)
#print(wages1)

# Salary range by company size.
# NOTE(review): column assignment aligns on the row index, so this (and the
# two assignments further down) assumes wages1 shares df's index — TODO confirm.
wages1['scale'] = scale3
min2 = wages1.groupby(by=['scale']).agg({'min':min})
max2 = wages1.groupby(by=['scale']).agg({'max':max})
min2.reset_index(level=0, inplace=True)
max2.reset_index(level=0, inplace=True)
#print(max2)

# Salary range by required education.
wages1['education'] = education1
min3 = wages1.groupby(by=['education']).agg({'min':min})
max3 = wages1.groupby(by=['education']).agg({'max':max})
min3.reset_index(level=0, inplace=True)
max3.reset_index(level=0, inplace=True)

# Salary range by required experience.
wages1['experience'] = experience1
min4 = wages1.groupby(by=['experience']).agg({'min':min})
max4 = wages1.groupby(by=['experience']).agg({'max':max})
min4.reset_index(level=0, inplace=True)
max4.reset_index(level=0, inplace=True)


# Posting counts per position title and per company.
df['add'] = 1  # helper column of ones, summed to count rows per group
position = df.groupby(by=['position']).agg({'add':sum})
position = position.sort_values('add',ascending=False)  # most frequent titles first
#print(position)
name = df.groupby(by=['name']).agg({'add':sum})
name = name.sort_values('add',ascending=False)  # companies with the most postings first
#print(name)

# Frequency table of industries (feeds the word cloud).
industry = df['industry'].apply(pd.Series)  # Series -> single-column DataFrame
industry = industry.apply(lambda x:x.value_counts()).fillna('0').astype(int)  # occurrences of each industry
industry = industry.apply(lambda x:x.sum(),axis=1)
industry = pd.DataFrame(industry,columns=['counts'])
industry.reset_index(level=0, inplace=True)  # industries become the 'index' column read by the word cloud
#print(industry)

# Number of postings per industry: the six largest, re-sorted ascending.
industry1 = df.groupby(by=['industry']).agg({'position':'count'})
industry1 = industry1.sort_values('position',ascending=False)
industry1.reset_index(level=0, inplace=True)
industry1 = industry1.head(6).sort_values('position',ascending=True)
#print(industry1)

# Number of distinct hiring companies per industry: the six largest,
# re-sorted ascending.
industry2 = df.drop_duplicates(subset=[ 'name' ], keep ='first')  # one row per company name
industry2 = industry2.groupby(by=['industry']).agg({'name':'count'})
industry2 = industry2.sort_values('name',ascending=False)
industry2.reset_index(level=0, inplace=True)
industry2 = industry2.head(6).sort_values('name',ascending=True)
#print(industry2)




from pyecharts import options as opts
from pyecharts.charts import *

def echarts_industry_wc():
    """Render a word cloud of industries sized by how often each occurs.

    Reads the module-level ``industry`` table (``index`` and ``counts``
    columns) and writes ``./html/echarts_industry_wc.html``.
    """
    word_weights = list(zip(industry['index'], industry['counts']))
    chart_title = opts.TitleOpts(title="数据分析主要行业")
    cloud = (
        WordCloud()
        .add("", word_weights, word_size_range=[20, 100], shape='diamond')
        .set_global_opts(title_opts=chart_title)
    )
    cloud.render('./html/echarts_industry_wc.html')
# Render the industry word cloud when the script runs.
echarts_industry_wc()

# Hiring demand of the main industries.
def echarts_industry_bar():
    """Render a horizontal bar chart of postings and hiring companies per industry.

    The axis shows the industries of the module-level ``industry2`` table
    (the top industries by number of distinct companies). Both series are
    looked up for exactly those industries, so every bar belongs to the
    label it is drawn next to. Writes ``./html/echarts_industry_bar.html``.
    """
    industries = industry2['industry'].tolist()
    # Count postings per industry from the full data and pick the values by
    # industry name; a positional pairing with a separately ranked table
    # could attach a count to the wrong industry.
    postings_by_industry = df.groupby('industry')['position'].count()
    posting_counts = [int(postings_by_industry[label]) for label in industries]

    bar = Bar()
    bar.add_xaxis(industries)
    bar.add_yaxis("招聘岗位", posting_counts)
    bar.add_yaxis("招聘公司", industry2['name'].tolist())
    bar.set_global_opts(title_opts=opts.TitleOpts(title="主要行业招聘需求",
                                                  subtitle="数据来源:zhipin.com"),
                        # Show split lines on the axis.
                        xaxis_opts=opts.AxisOpts(
                            splitline_opts=opts.SplitLineOpts(is_show=True)))
    # Series options: place the value labels to the right of the bars.
    bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    bar.reversal_axis()  # swap the x and y axes to get horizontal bars
    bar.render("./html/echarts_industry_bar.html")
#echarts_industry_bar()


# Hard-coded Shanghai district names used as labels by the address charts.
city = ["崇明区", "浦东新区", "宝山区", "嘉定区", "闵行区", "青浦区",
        "松江区", "奉贤区", "徐汇区", "金山区", "长宁区", "杨浦区",
        "静安区", "普陀区", "虹口区", "黄浦区"]
# Original author's note: postings without a district were dropped and made up
# about half of the data, so the per-district counts are doubled to spread
# them proportionally over the districts.
address['counts'] = address['counts'] * 2
def echarts_address_map():
    """Render a map of Shanghai coloured by posting count.

    The names in the module-level ``city`` list are paired with the values
    of ``address['counts']`` purely by position. Writes
    ``./html/echarts_address_map.html``.
    """
    district_counts = list(zip(city, address['counts'].tolist()))
    heading = opts.TitleOpts(title="上海数据分析招聘岗位数目",
                             subtitle="数据来源:zhipin.com", pos_left="20%")
    colour_scale = opts.VisualMapOpts(min_=0, max_=80, is_piecewise=True)
    hidden_legend = opts.LegendOpts(is_show=False)

    chart = Map()
    chart.add("map", district_counts, "上海",
              label_opts=opts.LabelOpts(is_show=False))
    chart.set_global_opts(title_opts=heading,
                          visualmap_opts=colour_scale,
                          legend_opts=hidden_legend)
    chart.render("./html/echarts_address_map.html")
#echarts_address_map()

def echarts_address_pie():
    """Render a pie chart of posting counts per district.

    The names in the module-level ``city`` list are paired with the values
    of ``address['counts']`` purely by position. Writes
    ``./html/echarts_address_pie.html``.
    """
    slices = list(zip(city, address['counts'].tolist()))
    chart = Pie()
    chart.add(
        "",
        slices,
        center=["40%", "50%"],  # position of the pie's centre
        radius=["0%", "50%"],  # inner and outer radius
    )
    # Hide the legend.
    chart.set_global_opts(legend_opts=opts.LegendOpts(is_show=False))
    # Label format: name, value, percentage.
    chart.set_series_opts(opts.LabelOpts(formatter="{b}:{c}:{d}%"))
    chart.render('./html/echarts_address_pie.html')
#echarts_address_pie()

def echarts_type_pie():
    """Render two side-by-side pies: company types and company head counts.

    Reads the module-level ``type`` and ``scale2`` tables (``index`` and
    ``counts`` columns) and writes ``./html/echarts_type_pie.html``.
    """
    type_slices = list(zip(type['index'].tolist(), type['counts'].tolist()))
    size_slices = list(zip(scale2['index'].tolist(), scale2['counts'].tolist()))
    pie_radius = ["0%", "45%"]  # inner and outer radius shared by both pies

    chart = Pie(init_opts=opts.InitOpts(bg_color="#f5f9fc"))
    # Left pie: company type.
    chart.add("", type_slices, center=["25%", "50%"], radius=pie_radius)
    # Right pie: company head count.
    chart.add("", size_slices, center=["70%", "50%"], radius=pie_radius)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="招聘公司情况",
                                  subtitle="数据来源:zhipin.com"),
        legend_opts=opts.LegendOpts(is_show=False),  # hide the legend
    )
    chart.set_series_opts(opts.LabelOpts(formatter="{b}:{c}"))  # label format: name and value
    chart.render('./html/echarts_type_pie.html')
#echarts_type_pie()

def echarts_wages_len1():
    """Render salary ranges by company type as lines over a scatter plot.

    Lines show the per-type maximum (``max1``) and minimum (``min1``); the
    scatter shows every row of ``wages1``. Writes
    ``./html/line_wages_len1.html``.
    """
    trend = (
        Line(init_opts=opts.InitOpts(bg_color="#f5f9fc"))
        .add_xaxis(max1['type'].tolist())
        .add_yaxis("最高工资", max1['max'].tolist())
        .add_yaxis("最低工资", min1['min'].tolist())
        .set_global_opts(title_opts=opts.TitleOpts(title="不同企业类型薪水情况"),
                         yaxis_opts=opts.AxisOpts(name="单位:K/月"))
    )
    points = (
        Scatter()
        .add_xaxis(wages1['type'].tolist())
        .add_yaxis("", wages1['min'].tolist(), symbol_size=7)  # marker size
        .add_yaxis("", wages1['max'].tolist(), symbol_size=7)
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    )
    trend.overlap(points)
    trend.render("./html/line_wages_len1.html")
#echarts_wages_len1()

def echarts_wages_len2():
    """Render salary ranges by company size as lines over a scatter plot.

    Lines show the per-size maximum (``max2``) and minimum (``min2``); the
    scatter shows every row of ``wages1``. Writes
    ``./html/line_wages_len2.html``.
    """
    trend = (
        Line(init_opts=opts.InitOpts(bg_color="#f5f9fc"))
        .add_xaxis(max2['scale'].tolist())
        .add_yaxis("最高工资", max2['max'].tolist())
        .add_yaxis("最低工资", min2['min'].tolist())
        .set_global_opts(title_opts=opts.TitleOpts(title="不同企业规模薪水情况"),
                         yaxis_opts=opts.AxisOpts(name="单位:K/月"))
    )
    points = (
        Scatter()
        .add_xaxis(wages1['scale'].tolist())
        .add_yaxis("", wages1['min'].tolist(), symbol_size=7)  # marker size
        .add_yaxis("", wages1['max'].tolist(), symbol_size=7)
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    )
    trend.overlap(points)
    trend.render("./html/line_wages_len2.html")
#echarts_wages_len2()


def echarts_wages_len3():
    """Render salary ranges by required education as lines over a scatter plot.

    Lines show the per-level maximum (``max3``) and minimum (``min3``); the
    scatter shows every row of ``wages1``. Writes
    ``./html/line_wages_len3.html``.
    """
    trend = (
        Line(init_opts=opts.InitOpts(bg_color="#f5f9fc"))
        .add_xaxis(max3['education'].tolist())
        .add_yaxis("最高工资", max3['max'].tolist())
        .add_yaxis("最低工资", min3['min'].tolist())
        .set_global_opts(title_opts=opts.TitleOpts(title="不同学历要求的薪水情况"),
                         yaxis_opts=opts.AxisOpts(name="单位:K/月"))
    )
    points = (
        Scatter()
        .add_xaxis(wages1['education'].tolist())
        .add_yaxis("", wages1['min'].tolist(), symbol_size=7)  # marker size
        .add_yaxis("", wages1['max'].tolist(), symbol_size=7)
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    )
    trend.overlap(points)
    trend.render("./html/line_wages_len3.html")
#echarts_wages_len3()

def echarts_wages_len4():
    """Render salary ranges by required experience as lines over a scatter plot.

    Lines show the per-group maximum (``max4``) and minimum (``min4``); the
    scatter shows every row of ``wages1``. Writes
    ``./html/line_wages_len4.html``.
    """
    trend = (
        Line(init_opts=opts.InitOpts(bg_color="#f5f9fc"))
        .add_xaxis(max4['experience'].tolist())
        .add_yaxis("最高工资", max4['max'].tolist())
        .add_yaxis("最低工资", min4['min'].tolist())
        .set_global_opts(title_opts=opts.TitleOpts(title="不同经验要求薪水情况"),
                         yaxis_opts=opts.AxisOpts(name="单位:K/月"))
    )
    points = (
        Scatter()
        .add_xaxis(wages1['experience'].tolist())
        .add_yaxis("", wages1['min'].tolist(), symbol_size=7)  # marker size
        .add_yaxis("", wages1['max'].tolist(), symbol_size=7)
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    )
    trend.overlap(points)
    trend.render("./html/line_wages_len4.html")
#echarts_wages_len4()

def echarts_education_pie():
    """Render two side-by-side pies: required education and required experience.

    Reads the module-level ``education`` and ``experience`` tables (``index``
    and ``counts`` columns) and writes ``./html/echarts_education_pie.html``.
    """
    education_slices = list(zip(education['index'].tolist(),
                                education['counts'].tolist()))
    experience_slices = list(zip(experience['index'].tolist(),
                                 experience['counts'].tolist()))
    pie_radius = ["0%", "45%"]  # inner and outer radius shared by both pies

    chart = Pie(init_opts=opts.InitOpts(bg_color="#f5f9fc"))
    # Left pie: education.
    chart.add("", education_slices, center=["25%", "50%"], radius=pie_radius)
    # Right pie: experience.
    chart.add("", experience_slices, center=["70%", "50%"], radius=pie_radius)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="招聘要求",
                                  subtitle="数据来源:zhipin.com"),
        legend_opts=opts.LegendOpts(is_show=False),  # hide the legend
    )
    chart.set_series_opts(opts.LabelOpts(formatter="{b}:{c}"))  # label format: name and value
    chart.render('./html/echarts_education_pie.html')
#echarts_education_pie()

def echarts_keyword_bar():
    """Render a bar chart of how often each skill keyword is mentioned.

    Reads the module-level ``keyword`` table (``index`` and ``counts``
    columns) and writes ``./html/echarts_keyword_bar.html``.
    """
    heading = opts.TitleOpts(title="数据分析师技能需求",
                             subtitle="数据来源:zhipin.com", pos_left="40%")
    # Rotate the category labels by 90 degrees.
    category_axis = opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=90))
    # Value axis with split lines and an axis name.
    value_axis = opts.AxisOpts(
        splitline_opts=opts.SplitLineOpts(is_show=True),
        name='提及次数')

    chart = (
        Bar()
        .add_xaxis(keyword['index'].tolist())
        .add_yaxis("", keyword['counts'].tolist())
        .set_global_opts(title_opts=heading,
                         legend_opts=opts.LegendOpts(is_show=False),  # hide the legend
                         xaxis_opts=category_axis,
                         yaxis_opts=value_axis)
    )
    chart.render("./html/echarts_keyword_bar.html")
#echarts_keyword_bar()

写在最后

个人觉得代码写得比较冗长、不够优雅,很多地方都可以优化改进;水平有限,仅供参考,各位轻喷!

  • 6
    点赞
  • 127
    收藏
    觉得还不错? 一键收藏
  • 4
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值