Analysis and Visualization of Big Data Job Postings on 51job (前程无忧), with Source Code

Preface: quite a few readers have asked me why job.csv contains no data after the cleaning step.
After some digging, it appears to come down to the versions of a few Python packages.
Here are the versions of the packages I was using at the time (mainly the pandas library):
[Screenshot: package versions used]
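The exact numbers in that screenshot are not reproduced here. Judging by the APIs used below (read_excel with an encoding argument, Geo and WordCloud imported from the top-level pyecharts package), this was a pandas < 1.0 and pyecharts 0.x environment. A quick way to print your own versions for comparison:

import pandas, numpy, requests, lxml, matplotlib, pyecharts
# Print the installed version of every package this project touches.
for m in (pandas, numpy, requests, lxml, matplotlib, pyecharts):
    print(m.__name__, getattr(m, '__version__', 'unknown'))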
1. Data collection (Python scraper)
I won't say much about the scraper itself (the focus here is the analysis). The scraped records go into an Excel sheet first (see the conversion sketch at the start of section 2), and are then cleaned and visualized.



#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
import random,time
from lxml import html
key = '大数据'
headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "Accept-Encoding": "gzip, deflate",
               "Accept-Language": "en-US,en;q=0.5",
               "Connection": "keep-alive",
               "Host": "jobs.51job.com",
               "Upgrade-Insecure-Requests": "1",
               "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
def get_urls(page):
    url0 = 'http://search.51job.com/list/000000,000000,0000,00,9,99,' + key + ',2,' + str(page) + '.html'
    req0 = requests.get(url0, headers=headers, timeout=10)
    req0.encoding = 'gbk'  # 51job pages are gbk-encoded
    result = re.compile(r'class="t1 ">.*? <a target="_blank" title=".*?" href="(.*?)".*? <span class="t2">', re.S)  # without re.S, '.' would not match across lines
    url = re.findall(result, req0.text)
    return url
def str1(x):
    # join the items of a list into one ';'-separated string
    y = ''
    for x0 in x:
        y = y + x0 + ';'
    return y
def get_content(url):
    req1 = requests.get(url, headers=headers, timeout=10)
    req1.encoding = 'gbk'
    t1 = html.fromstring(req1.text)
    try:
        job_name = t1.xpath('//div[@class="tHeader tHjob"]//h1/text()')[0].strip()
        company_name = t1.xpath('//p[@class="cname"]//a/text()')[0].strip()
        job_place = t1.xpath('//p[@class="msg ltype"]/text()')[0].strip().split('-')[0]  # some postings list only a city, so keep just the city part
        workExperience = t1.xpath('//p[@class="msg ltype"]/text()')[1].strip()
        educationBackground = t1.xpath('//p[@class="msg ltype"]/text()')[2].strip()
        require_num = t1.xpath('//p[@class="msg ltype"]/text()')[3].strip()
        date = t1.xpath('//p[@class="msg ltype"]/text()')[4].strip()
        Salary = t1.xpath('//div[@class="tHeader tHjob"]//strong/text()')[0].strip()
        company_type = t1.xpath('//div[@class="com_tag"]/p/text()')[0].strip()
        company_num = t1.xpath('//div[@class="com_tag"]/p/text()')[1].strip()
        company_business = str1(t1.xpath('//div[@class="com_tag"]/p//a/text()'))  # joined with the helper str1() defined above
        job_treatment = str1(re.findall(re.compile(r'<span class="sp4">(.*?)</span>', re.S), req1.text))
        job_describe = str1(t1.xpath('//div[@class="bmsg job_msg inbox"]/p/text()'))
        content = str(company_name) + ',' + str(job_name) + ',' + str(job_place) + ',' + str(workExperience) + ',' +\
               str(educationBackground) + ',' + str(require_num) + ',' + str(date) + ',' + str(Salary) + ',' + str(
            company_type) + ',' + str(company_num) + ',' + str(company_business) + ',' + str(job_treatment)+ ',' + str(job_describe) + '\n'
        file.write(content)
        return content
    except Exception as e:
        print(e)
if __name__ == '__main__':
    file = open('51job.csv', 'w', encoding='gbk')
    content0 = 'company_name,job_name,job_place,workExperience,educationBackground,require_num,date,Salary,company_type,' \
              'company_num,company_business,job_treatment,job_describe'+'\n'
    file.write(content0)
    for page in range(1,200):
        print('Crawling page {}'.format(page))
        #time.sleep(random.random() + random.randint(1, 5))
        urls = get_urls(page)
        try:
            for url in urls:
                try:
                    data = get_content(url)
                    print(data)
                    #time.sleep(random.random() + random.randint(0,1))
                except Exception:
                    print('None')
        except Exception as e:
            print(e)
    file.close()
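One caveat about the writer above: job_treatment and job_describe can contain commas of their own, which silently shifts columns in 51job.csv. A hedged alternative is to have get_content collect the fields in a list and let the csv module handle the quoting, e.g.:

import csv
# Sketch: write one properly quoted row per posting instead of a raw
# comma-joined string, so embedded commas cannot break the columns.
with open('51job.csv', 'w', encoding='gbk', errors='ignore', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['company_name', 'job_name', 'job_place', 'Salary'])  # header (abridged)
    writer.writerow(['示例公司', '大数据开发工程师', '上海', '1-2万/月'])  # one example row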

2. Data cleaning and preprocessing (pandas, numpy, etc.; install them yourself)
(produces the file job.csv)
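The crawler in section 1 wrote 51job.csv, while the script below reads data0.xlsx, so the CSV has to be converted to Excel first. A minimal sketch, assuming pandas plus an Excel writer such as openpyxl is installed:

import pandas as pd
# Convert the crawler's gbk-encoded CSV into the Excel file the cleaning script expects.
df = pd.read_csv('51job.csv', encoding='gbk')
df.to_excel('data0.xlsx', index=False)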

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import re
data0 = pd.read_excel('data0.xlsx', encoding='gbk')  # note: read_excel only accepts encoding= on pandas < 1.0; drop the argument on newer versions
def get_salary(salary):        # normalize a raw salary string to (low, high) in thousands of RMB per month
    if '-' in salary:  # range form, e.g. 1-2万/月 or 10-20万/年
        low_salary = re.findall(re.compile(r'(\d*\.?\d+)'), salary)[0]
        high_salary = re.findall(re.compile(r'(\d*\.?\d+)'), salary)[1]
        if u'万' in salary and u'年' in salary:  # convert everything to thousands per month
            low_salary = round(float(low_salary) / 12 * 10, 2)
            high_salary = round(float(high_salary) / 12 * 10, 2)
        elif u'万' in salary and u'月' in salary:
            low_salary = round(float(low_salary) * 10, 2)
            high_salary = round(float(high_salary) * 10, 2)
    else:  # single-value form, e.g. 20万以上/年 or 100元/天: keep only the lower bound, no upper bound
        low_salary = re.findall(re.compile(r'(\d*\.?\d+)'), salary)[0]
        high_salary = np.nan
        if u'万' in salary and u'年' in salary:  # convert to thousands per month
            low_salary = round(float(low_salary) / 12 * 10, 2)
        elif u'万' in salary and u'月' in salary:
            low_salary = round(float(low_salary) * 10, 2)
        elif u'元' in salary and u'天' in salary:
            low_salary = round(float(low_salary) / 1000 * 21, 2)  # assume 21 working days per month
    return low_salary, high_salary
def get_experience(exp):  # extract the minimum years of work experience required
    if u'无' in exp:
        low_exp = [0]
    else:
        low_exp = re.findall(re.compile(r'(\d*)'), exp)
    return low_exp[0]
# data cleaning
def data_clear(df):
    df.drop(columns='job_describe', inplace=True)
    #df.drop_duplicates(subset='company_name', inplace=True)
    df.dropna(axis=0, how='any', inplace=True)   # drop every row with a missing value
    print('shape after dropping rows with missing values:', df.shape)
    df = df[df.job_name.str.contains('数据')]  # keep only rows whose job_name contains '数据'
    print('shape after keeping job names containing "数据":', df.shape)
    df.reset_index(drop=True, inplace=True)  # replaces the to_csv/read_csv round trip: positional drops below need a fresh index
    cols1 = [i for i in df.index if u'招' in df.loc[i, 'educationBackground']]
    df.drop(cols1, axis=0, inplace=True)
    print('shape after removing rows whose education field holds a headcount:', df.shape)
    df.reset_index(drop=True, inplace=True)
    cols2 = [i for i in df.index if u'经验' in df.loc[i, 'educationBackground']]
    df.drop(cols2, axis=0, inplace=True)
    print('shape after removing rows whose education field holds work experience:', df.shape)
    df.reset_index(drop=True, inplace=True)
    cols3 = [i for i in df.index if u'省' in df.loc[i, 'job_place']]
    df.drop(cols3, axis=0, inplace=True)
    df.to_csv('job0.csv')
    print('shape after removing rows whose job_place is a province:', df.shape)
# data preprocessing
def data_deal():
    data = pd.read_csv('job0.csv')
    data = data.iloc[:, 1:]
    low_list, high_list, exp_list = [], [], []
    for i in range(0, data.shape[0]):
        salary_lh = get_salary(data['Salary'][i])
        low_list.append(salary_lh[0])
        high_list.append(salary_lh[1])
        exp = get_experience(data['workExperience'][i])
        exp_list.append(exp)
    col_name = data.columns.tolist()
    col_name.insert(col_name.index('Salary'), 'low_salary')
    data = data.reindex(columns=col_name)
    data.rename(columns={'Salary': 'high_salary'}, inplace=True)
    data['low_salary'], data['high_salary'] = low_list, high_list  # split the raw salary into low/high, normalized to k/month
    data['workExperience'] = exp_list                              # keep only the minimum years of experience
    #print(data['high_salary'].isnull().sum())  # high_salary had 45 missing values, so those rows are dropped
    data.dropna(axis=0, subset=['high_salary'], inplace=True)
    data.index = np.arange(data.shape[0])  # equivalently: data.reset_index(drop=True, inplace=True)
    data.to_csv('job.csv', encoding='utf-8')
    print('shape after preprocessing:', data.shape)
if __name__ == '__main__':
    data_clear(data0)
    data_deal()
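A few worked examples of what the two normalization helpers return (the values follow directly from the code above; units are thousands of RMB per month):

print(get_salary('1-2万/月'))        # (10.0, 20.0)
print(get_salary('10-20万/年'))      # (8.33, 16.67): 10/12*10 and 20/12*10
print(get_salary('20万以上/年'))     # (16.67, nan): no '-', so no upper bound
print(get_salary('100元/天'))        # (2.1, nan): 100/1000*21 working days
print(get_experience('3-4年经验'))    # '3': the minimum years of experience
print(get_experience('无需工作经验'))  # 0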

3. Data visualization (matplotlib and pyecharts)
[Screenshots: education and company-type bar charts, experience/education vs. salary line charts, the national city-distribution geo map, and the job-title word cloud]

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from pyecharts import Geo        # pyecharts 0.x API (in 1.x these classes moved to pyecharts.charts)
import pandas as pd
import matplotlib.pyplot as plt
from pyecharts import WordCloud
plt.rcParams['font.sans-serif'] = 'SimHei'  # Chinese font so the chart labels render
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('job.csv', encoding='utf-8')  # job.csv was written as utf-8 in section 2, not gbk
def edu_com(df):
    edu = df['educationBackground'].value_counts()
    y1 = edu.index.tolist()
    x1 = edu.values.tolist()
    ax = plt.figure(figsize=(12, 6))
    ax1 = ax.add_subplot(1, 2, 1)
    plt.barh(y1, x1, height=0.2, color='green')
    plt.xlabel('公司数量')
    plt.ylabel('学历要求')
    plt.title('大数据职位对学历要求条形图')
    ax2 = ax.add_subplot(1, 2, 2)
    com = df['company_type'].value_counts()
    y = com.index.tolist()
    x = com.values.tolist()
    plt.barh(y, x, height=0.2, color='purple')
    plt.xlabel('公司数量')
    plt.ylabel('公司类型')
    plt.title('大数据职位公司类型条形图')
    plt.savefig('大数据职位学历及公司类型条形图.png')
    plt.show()
def exp_edu_sal(df):
    d1 = df[['workExperience','low_salary','high_salary']].groupby('workExperience').mean()
    s1 = d1.index
    s11 = d1['low_salary']
    s12 = d1['high_salary']
    d2 = df[['educationBackground','low_salary','high_salary']].groupby('educationBackground').mean()
    s2 = d2.index
    s21 = d2['low_salary']
    s22 = d2['high_salary']
    p = plt.figure(figsize=(12,6))
    p1 = p.add_subplot(1,2,1)
    plt.plot(s1,s11,c='r')
    plt.plot(s1,s12,c='b')
    plt.xlabel('工作经验(年)')
    plt.ylabel('薪资(k/月)')
    plt.legend(['最低薪资','最高薪资'])
    plt.title('大数据职位工作经验对薪资影响折线图')
    p2 = p.add_subplot(1, 2, 2)
    plt.plot(s2, s21, c='r')
    plt.plot(s2, s22, c='b')
    plt.xlabel('学历')
    plt.ylabel('薪资(k/月)')
    plt.legend(['最低薪资', '最高薪资'])
    plt.title('大数据职位学历对薪资影响折线图')
    plt.savefig('大数据职位工作经验及学历对薪资影响折线图.png')
    plt.show()
def job_city_geo(df):
    job_place = df['job_place'].value_counts()[0:40]
    job_indexs = job_place.index.tolist()
    counts = job_place.values.tolist()
    geo = Geo("大数据职位在全国分布(前40城市排名)地理坐标系图", title_color="#fff",
              title_pos="center", width=1200,
              height=600, background_color='#404a59')
    geo.add("", attr=job_indexs, value=counts, visual_range=[0, 1400], visual_text_color="#fff",
            symbol_size=12, is_visualmap=True, is_roam=False)
    geo.render('大数据职位在全国分布(前40城市排名)地理坐标系图.html')
def job_name_wc(df):
    job_name = df['job_name'].value_counts()
    job_indexs = job_name.index.tolist()
    counts = job_name.values.tolist()
    wordcloud = WordCloud('大数据工作名称词云图', width=1300, height=620)
    wordcloud.add("", attr=job_indexs, value=counts, word_size_range=[12, 100])  # [min, max] font size
    wordcloud.render('大数据工作名称词云图.html')
if __name__ == '__main__':
    edu_com(data)
    exp_edu_sal(data)
    job_city_geo(data)
    job_name_wc(data)
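The script above uses the pyecharts 0.x API. On pyecharts 1.x or later, the chart classes live in pyecharts.charts and take (word, value) pairs instead of separate attr/value lists; here is a minimal sketch of the word cloud under that newer API (job_name_wc_v1 is my name for it, not part of the original code):

from pyecharts import options as opts
from pyecharts.charts import WordCloud as WordCloudV1

def job_name_wc_v1(df):
    # pyecharts >= 1.0 expects a list of (word, value) pairs
    pairs = [(name, int(cnt)) for name, cnt in df['job_name'].value_counts().items()]
    wc = WordCloudV1()
    wc.add("", pairs, word_size_range=[12, 60])
    wc.set_global_opts(title_opts=opts.TitleOpts(title="大数据工作名称词云图"))
    wc.render('大数据工作名称词云图.html')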