Scraping 51job (前程无忧) with Python and visualizing the results

This post walks through scraping job postings from 51job (前程无忧) with the Scrapy framework, capturing fields such as job title and salary. The data is cleaned in Hive, compared across cities and positions by experience requirement and salary level, then exported to MySQL and visualized.

I. Scraping 51job (前程无忧)

Fields scraped: job title, salary, employer, work location, experience required, education required.
items.py:

import scrapy


class Qcwy01Item(scrapy.Item):
    name = scrapy.Field()    # job title
    money = scrapy.Field()   # salary
    danwei = scrapy.Field()  # employer
    didian = scrapy.Field()  # work location
    jinyan = scrapy.Field()  # experience required
    xueli = scrapy.Field()   # education required
    zhizhe = scrapy.Field()  # job responsibilities
    jineng = scrapy.Field()  # skill requirements

The spider (spiders.py):

import scrapy
from qcwy01.items import Qcwy01Item


class QcwySpider(scrapy.Spider):
    name = 'qcwy'
    allowed_domains = ['51job.com']  # domain only; a full URL here would filter out every request
    start_urls = ['https://search.51job.com/list/000000,000000,0130%252C7501%252'
                  'C7506%252C7502,01%252C32%252C38,9,99,%2520,2,1.html?lang=c&stype'
                  '=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
                  '&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0'
                  '&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from'
                  '=&welfare=']

    def parse(self, response):
        # links to the individual job-detail pages
        urls = response.xpath("//*[@id='resultList']/div[@class='el']/p/span/a/@href").getall()
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_html, dont_filter=True)
        # link to the next results page
        next_page = response.xpath("//div[@class='p_in']//li[last()]/a/@href").get()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)

    def parse_html(self, response):
        item = Qcwy01Item()
        try:
            name = response.xpath("//div[@class='cn']/h1/text()").getall()[0]
            money = response.xpath("//div[@class='cn']//strong/text()").get()
            danwei = response.xpath("//div[@class='cn']//p[@class='cname']/a[1]/@title").get()
            # the "msg ltype" line holds location / experience / education, in that order
            msg = response.xpath("//div[@class='cn']//p[@class='msg ltype']/text()").getall()
            didian = msg[0]
            jinyan = msg[1]
            xueli = msg[2]
            # job responsibilities
            zhizhes = response.xpath("//div[@class='bmsg job_msg inbox']//text()").getall()
            zhizhe_str = "".join(z.strip() for z in zhizhes)
            # skill keywords
            key = response.xpath("//p[@class='fp'][2]/a/text()").getall()
            jineng = " ".join(key)
        except Exception:  # on any parse failure, fall back to empty fields
            name = money = danwei = didian = jinyan = xueli = zhizhe_str = jineng = ""
        item["name"] = name
        item["money"] = money
        item["danwei"] = danwei
        item["didian"] = didian
        item["jinyan"] = jinyan
        item["xueli"] = xueli
        item["zhizhe"] = zhizhe_str
        item["jineng"] = jineng
        yield item
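To launch the crawl, `scrapy crawl qcwy` from the project root is the usual route. As an alternative, a minimal sketch of driving it from a plain script; the spider's module path here is an assumption about the project layout:

# run.py -- a sketch of launching the spider without the scrapy CLI;
# the module path qcwy01.spiders.qcwy is an assumed project layout
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from qcwy01.spiders.qcwy import QcwySpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(QcwySpider)
process.start()  # blocks until the crawl finishes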

settings.py:

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.14 Safari/537.36'
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'qcwy_db'
MONGODB_DOCNAME = 'qcwyTable'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en',
    # header names use hyphens; an underscored 'User_Agent' key would be sent as an unknown header
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.14 Safari/537.36',
}
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
    'qcwy01.pipelines.Qcwy01Pipeline': 300,
}

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class Qcwy01Pipeline:
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        self.client = pymongo.MongoClient(host=host, port=port)
        self.db = self.client[settings['MONGODB_DBNAME']]
        self.coll = self.db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        self.coll.insert_one(data)  # insert_one replaces the deprecated Collection.insert
        return item

    def close_spider(self, spider):  # Scrapy calls close_spider, not close
        self.client.close()
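A quick way to confirm that the pipeline is writing documents is to query the collection directly; a small sketch using the same connection settings as above:

# check_mongo.py -- sanity-check the scraped collection (a sketch)
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
coll = client['qcwy_db']['qcwyTable']
print(coll.count_documents({}))        # number of documents scraped so far
print(coll.find_one({}, {'_id': 0}))   # one sample document
client.close()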

Run results, and the documents as stored in MongoDB (screenshots omitted). A second crawl, built the same way, collected data from Lagou (拉勾网).

II. Storing the scraped data on HDFS, using Flume to collect the log

1. Create the configuration file, in the flume/conf directory:

vi qc.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /opt/software/qiancheng.txt
a1.sources.r1.channels = c1

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path =hdfs://bigdata105:9000/flume/tailout/%y-%m-%d/%H-%M/ 
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Output file type: the default is SequenceFile; DataStream writes plain text
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

Start Flume to collect the log file:

[root@bigdata105 conf]# ../bin/flume-ng agent -c conf -f qc.conf -n a1 -Dflume.root.logger=DEBUG,console
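The exec source tails /opt/software/qiancheng.txt, so the scraped documents have to be written to that file as comma-delimited lines first. The original post does not show this step; a minimal sketch, with the field order chosen to match the Hive table defined in the next section:

# dump_to_txt.py -- write the MongoDB documents as comma-delimited lines for
# the Flume exec source and the Hive LOAD; this helper is an assumption,
# the dump step is not shown in the original post
import pymongo

FIELDS = ['name', 'money', 'danwei', 'didian', 'jinyan', 'xueli', 'zhizhe', 'jineng']

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['qcwy_db']['qcwyTable']
with open('/opt/software/qiancheng.txt', 'w', encoding='utf-8') as f:
    for i, doc in enumerate(coll.find({}, {'_id': 0})):
        # commas inside a field would break the Hive delimiter, so drop them
        values = [str(doc.get(k, '')).replace(',', ' ') for k in FIELDS]
        f.write(','.join([str(i)] + values) + '\n')   # leading id column
client.close()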


III. Data analysis

1. Compute the average, highest, and lowest salary for positions such as "数据分析" (data analysis), "大数据开发工程师" (big data development engineer), and "数据采集" (data collection), and present the results as a bar chart.
Create the table in Hive:

create table t_qiancheng(id string,name string,money string,danwei string,didian string,jinyan string,xueli string,zhizhe string,jineng string) row format delimited fields terminated by ',';

Load the data:

load data local inpath "/opt/software/qiancheng.txt" into table t_qiancheng;

Keep only the name and money columns:

create table name_money as select name,money from t_qiancheng;

Split out the position categories:

create table m1 as select name,money from name_money where name like '%数据分析%';
create table m2 as select name,money from name_money where name like '%数据采集%';
create table m3 as select name,money from name_money where name like '%大数据%';

Salary analysis, worked through here for the big-data table m3; the other categories follow the same steps. First normalize the differently-quoted salary values.
1. Separate salaries quoted per year from those quoted per month:

create table m4 as select * from m3 where money like '%年';
create table m5 as select * from m3 where money like '%月';

2. Strip the "/年" and "/月" suffixes:

create table m6 as select regexp_replace(money, '/月', '')as pay from m5;
create table m7 as select regexp_replace(money, '/年', '')as pay from m4;

Split the rows quoted in "千" (thousand) from those quoted in "万" (ten thousand):

create table m8 as select regexp_replace(pay, '千','')as pay from  m6 where pay like '%千';
create table m9 as select regexp_replace(pay, '万','')as pay from  m6 where pay like '%万';
create table m10 as select regexp_replace(pay, '万','')as pay from m7 where pay like '%万';

Split the salary range into min and max:

create table m11 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from m8;
create table m12 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from m9;
create table m13 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from m10;

Remove the stray "money:" prefix:

create table m14 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from m11;
create table m15 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from m12;
create table m16 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from m13;

Convert everything to yuan per month:

create table m18 as select min*1000 as min,max*1000 as max,(min+max)/2*1000 as avg from m14;
create table m19 as select min*10000 as min,max*10000 as max,(min+max)/2*10000 as avg from m15;
create table m17 as select min*10000/12 as min,max*10000/12 as max,(min+max)/2*10000/12 as avg from m16;

Combine the three into one table and aggregate:

insert into m18 select * from m19;
insert into m18 select * from m17;
create table all03 as select min(min) as min,max(max) as max,avg(avg) as avg from m18;
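The normalization that the m4-m19 tables perform can be written compactly in Python, which makes the conversion rules easy to verify; a sketch of the same logic, not code from the original:

# parse_salary.py -- the regexp_replace/split/convert chain in one function:
# "0.8-1万/月" -> 8000-10000 yuan/month, "10-20万/年" -> 8333-16667 yuan/month
import re

def parse_salary(money):
    """Return (min, max, avg) in yuan per month, or None if unparseable."""
    m = re.match(r'([\d.]+)-([\d.]+)(千|万)/(月|年)', money)
    if not m:
        return None
    factor = 1000 if m.group(3) == '千' else 10000
    lo, hi = float(m.group(1)) * factor, float(m.group(2)) * factor
    if m.group(4) == '年':   # yearly figures are divided by 12
        lo, hi = lo / 12, hi / 12
    return lo, hi, (lo + hi) / 2

print(parse_salary('0.8-1万/月'))   # (8000.0, 10000.0, 9000.0)
print(parse_salary('10-20万/年'))   # (8333.33..., 16666.66..., 12500.0)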

The same procedure applied to the "数据采集" and "数据分析" tables yields the corresponding figures (screenshots omitted).

Export the Hive table to MySQL with Sqoop. First create the receiving table in MySQL:

create table work1(
min double,
max double,
avg double
)ENGINE=InnoDB DEFAULT CHARSET=utf8;

2. Export the table:

sh sqoop export --connect "jdbc:mysql://192.168.1.105:3306/hive_dbs?characterEncoding=UTF-8" --username root --password 123456 --table work1 --fields-terminated-by '\001' --export-dir '/user/hive/warehouse/qc.db/all01';
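To confirm the export landed, query the MySQL table; a sketch using pymysql (any MySQL client would do, and the credentials simply mirror the sqoop command):

# check_mysql.py -- verify the sqoop export (a sketch)
import pymysql

conn = pymysql.connect(host='192.168.1.105', user='root', password='123456',
                       database='hive_dbs', charset='utf8')
with conn.cursor() as cur:
    cur.execute('SELECT `min`, `max`, `avg` FROM work1')
    for row in cur.fetchall():
        print(row)
conn.close()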

Visualization code:

# coding=gbk
from pyecharts import options as opts
from pyecharts.charts import Bar

post_list = ["数据分析", "数据采集", "大数据"]
avg_list = [10627.073782016038, 8286.754160910472, 7071.929824561404]
max_list = [500000, 180000, 30000]
min_list = [300, 300, 500]
bar = (
    Bar(
        init_opts=opts.InitOpts(width="1600px", height="900px"),
    )
    .set_global_opts(
        # chart title and subtitle (unit: yuan per month)
        title_opts=opts.TitleOpts(title="行业薪资", subtitle="单位  元/月"),
        # tilt the x-axis labels by 30 degrees
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
        # show the toolbox
        toolbox_opts=opts.ToolboxOpts()
    )
    .add_xaxis(post_list)            # x axis: position categories
    .add_yaxis('平均工资', avg_list)  # average salary
    .add_yaxis('最高工资', max_list)  # highest salary
    .add_yaxis('最低工资', min_list)  # lowest salary
)
bar.render("行业薪资.html")


2. Count the number of "数据分析", "大数据开发工程师", and "数据采集" positions in Chengdu, Beijing, Shanghai, Guangzhou, and Shenzhen, and present the results as pie charts.
Split out the position categories:

create table n1 as select name,didian from t_qiancheng where name like '%数据分析%';
create table n2 as select name,didian from t_qiancheng where name like '%数据采集%';
create table n3 as select name,didian from t_qiancheng where name like '%大数据%';

Starting with the big-data positions, extract the rows for Chengdu, Beijing, Shanghai, Guangzhou, and Shenzhen:

create table k1 as select * from n3 where didian like '%成都%';
create table k2 as select * from n3 where didian like '%北京%';
create table k3 as select * from n3 where didian like '%上海%';
create table k4 as select * from n3 where didian like '%广州%';
create table k5 as select * from n3 where didian like '%深圳%';

Count the postings per city:

create table b7 as select count(*) from k1;
create table b8 as select count(*) from k2;
create table b9 as select count(*) from k3;
create table b10 as select count(*) from k4;
create table b11 as select count(*) from k5;

Combine the counts into one table:

insert into b7 select * from b8;
insert into b7 select * from b9;
insert into b7 select * from b10;
insert into b7 select * from b11;
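The per-city counts can also be cross-checked in Python straight from the MongoDB collection, using the same substring matching as the Hive LIKE filters; a sketch, not code from the original:

# city_counts.py -- postings per city for big-data jobs, mirroring k1-k5/b7-b11
import pymongo

CITIES = ['成都', '北京', '上海', '广州', '深圳']

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['qcwy_db']['qcwyTable']
docs = list(coll.find({'name': {'$regex': '大数据'}}, {'didian': 1}))
for city in CITIES:
    count = sum(1 for d in docs if city in (d.get('didian') or ''))
    print(city, count)
client.close()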

The same procedure gives the per-city counts for the "数据采集" and "数据分析" positions (screenshots omitted).
Save the counts to MySQL; create the receiving table:

create table work5(
_c0 bigint
);

Export the data:

sh sqoop export --connect jdbc:mysql://192.168.1.105:3306/hive_dbs --username root -P --table work5 --export-dir '/user/hive/warehouse/qc.db/b7' --fields-terminated-by '\t';

Visualization code:

# coding=gbk
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']    # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False
sd = [6, 10, 2, 9, 10]                          # postings per city
labels = ['成都', '北京', '上海', '广州', '深圳']
plt.pie(x=sd, labels=labels, autopct='%.1f%%')  # one-decimal percentage labels
plt.title("大数据")
plt.show()


3. Analyze the salary level (average, highest, lowest) of big-data positions requiring 1-3 years of experience, and present a bar chart.
Extract the big-data positions:

create table p1 as select name,money,jinyan from t_qiancheng where name like '%大数据%';

Filter for 1-3 years of experience (a crude substring match; a stricter sketch follows below):

create table p2 as select name,money,jinyan from p1 where jinyan like '%1%';
create table p3 as select name,money,jinyan from p1 where jinyan like '%2%';
create table p4 as select name,money,jinyan from p1 where jinyan like '%3%';
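LIKE '%1%' also matches strings such as "10年经验", so the filter above is only approximate. A stricter version keyed on the lower bound of the stated range, sketched in Python:

# filter_experience.py -- stricter 1-3 year filter than LIKE '%1%'
# (a sketch, not from the original post)
import re

PATTERN = re.compile(r'([1-9]\d*)(?:-(\d+))?年经验')

def wants_1_to_3_years(jinyan):
    m = PATTERN.search(jinyan or '')
    if not m:
        return False
    low = int(m.group(1))
    return 1 <= low <= 3

print(wants_1_to_3_years('2年经验'))    # True
print(wants_1_to_3_years('3-4年经验'))  # True (lower bound within 1-3)
print(wants_1_to_3_years('10年经验'))   # False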

Combine the three result sets:

insert into p2 select * from p4;
insert into p2 select * from p3;

Keep the name and money columns and strip the quotation marks:

create table p5 as select name,money from p2;
create table p6 as select regexp_replace(name, '"','')as name,regexp_replace(money, '"','')as money from p5;

Separate per-year from per-month salaries:

create table p7 as select * from p6 where money like '%年';
create table p8 as select * from p6 where money like '%月';

Strip the "/月" and "/年" suffixes:

create table p9 as select regexp_replace(money, '/月', '')as money from p8;
create table p10 as select regexp_replace(money, '/年', '')as money from p7;

Split out the "千" and "万" salaries:

create table p11 as select regexp_replace(money, '千','')as money from  p9 where money like '%千';
create table p12 as select regexp_replace(money, '万','')as money from  p9 where money like '%万';
create table p13 as select regexp_replace(money, '万','')as money from  p10 where money like '%万';

Split the salary range into min and max:

create table p14 as select split(money,'-')[0] as min ,split(money,'-')[1] as max from p13;
create table p15 as select split(money,'-')[0] as min ,split(money,'-')[1] as max from p11;
create table p16 as select split(money,'-')[0] as min ,split(money,'-')[1] as max from p12;

Remove the "money:" prefix:

create table p17 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from p14;
create table p18 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from p15;
create table p19 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from p16;

Convert to yuan per month:

create table p21 as select min*1000 as min,max*1000 as max,(min+max)/2*1000 as avg from p18;
create table p22 as select min*10000 as min,max*10000 as max,(min+max)/2*10000 as avg from p19;
create table p20 as select min*10000/12 as min,max*10000/12 as max,(min+max)/2*10000/12 as avg from p17;

Combine into one table and aggregate:

insert into p20 select * from p21;
insert into p20 select * from p22;
create table all04 as select min(min) as min,max(max) as max,avg(avg) as avg from p20;

Export to MySQL; first create the receiving table:

create table work6(
min double,
max double,
avg double
)ENGINE=InnoDB DEFAULT CHARSET=utf8;

Export the data:

sh sqoop export --connect jdbc:mysql://192.168.1.105:3306/hive_dbs --username root -P --table work6 --export-dir '/user/hive/warehouse/qc.db/all04' --fields-terminated-by '\001';

Visualization code:

# coding=gbk
from pyecharts import options as opts
from pyecharts.charts import Bar

post_list = ["大数据"]
avg_list = [16119.764167546637]
max_list = [160000]
min_list = [1000]
bar = (
    Bar(
        init_opts=opts.InitOpts(width="1600px", height="900px"),
    )
    .set_global_opts(
        # chart title and subtitle (unit: yuan per month)
        title_opts=opts.TitleOpts(title="大数据薪资分析", subtitle="单位  元/月"),
        # tilt the x-axis labels by 30 degrees
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
        # show the toolbox
        toolbox_opts=opts.ToolboxOpts()
    )
    .add_xaxis(post_list)            # x axis: position category
    .add_yaxis('平均工资', avg_list)  # average salary
    .add_yaxis('最高工资', max_list)  # highest salary
    .add_yaxis('最低工资', min_list)  # lowest salary
)
bar.render("大数据薪资分析.html")

4. Analyze the demand trend for big-data positions over recent months, and present a line chart.
Extract the big-data postings from the job table:

create table w1 as select * from job where position_name like '%大数据%';

Extract the release dates:

create table w2 as select release_date from w1;

Count the postings for each month (the LIKE filters are crude; see the sketch after the next step):

create table w3 as select count(*) from w2 where release_date like '%05%';
create table w4 as select count(*) from w2 where release_date like '%06%';
create table w5 as select count(*) from w2 where release_date like '%07%';

Combine the counts into one table:

insert into w3 select * from w4;
insert into w3 select * from w5;
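A LIKE pattern such as '%05%' can also match the day part of a date. Parsing the month explicitly avoids that; a sketch, where the MM-DD date format is an assumption:

# month_counts.py -- count postings per release month; the Hive LIKE version
# can also match the day part (a sketch; MM-DD format is assumed)
from collections import Counter

def month_of(release_date):
    return release_date.split('-')[0]   # '06-28' -> '06'

dates = ['05-30', '06-28', '06-15', '07-01']   # stand-in sample data
print(Counter(month_of(d) for d in dates))     # Counter({'06': 2, ...})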

Upload the counts to MySQL. Create the table and export:

create table work8(
_c0 bigint
);
sh sqoop export --connect jdbc:mysql://192.168.1.105:3306/hive_dbs --username root -P --table work8 --export-dir '/user/hive/warehouse/qc.db/w3' --fields-terminated-by '\t';

Visualizing the trend in big-data job demand:

# coding=gbk
from pyecharts import options as opts
from pyecharts.charts import Line

post_list = ["五月", "六月", "七月"]
all_list = [277, 743, 2307]

line = (
    Line(
        init_opts=opts.InitOpts(width="1600px", height="900px"),
    )
    .set_global_opts(
        # chart title
        title_opts=opts.TitleOpts(title="大数据岗位趋势"),
        # tilt the x-axis labels by 30 degrees
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
        # show the toolbox
        toolbox_opts=opts.ToolboxOpts()
    )
    .add_xaxis(post_list)          # x axis: months
    .add_yaxis('岗位数', all_list)  # y axis: number of postings per month
)
line.render("大数据需求走势.html")

