I. Scraping 51job (前程无忧)
Fields scraped: job title, salary, hiring company, work location, work experience, education requirement.
Writing items.py
import scrapy

class Qcwy01Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()    # job title
    money = scrapy.Field()   # salary
    danwei = scrapy.Field()  # hiring company
    didian = scrapy.Field()  # work location
    jinyan = scrapy.Field()  # work experience
    xueli = scrapy.Field()   # education requirement
    zhizhe = scrapy.Field()  # job responsibilities
    jineng = scrapy.Field()  # required skills
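A Scrapy Item behaves like a dict with a fixed key set (assigning a key that was not declared above raises KeyError), so the field definitions can be sanity-checked interactively:

item = Qcwy01Item(name="大数据开发工程师", money="1-1.5万/月")
item["xueli"] = "本科"
print(dict(item))  # {'name': '大数据开发工程师', 'money': '1-1.5万/月', 'xueli': '本科'}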
Writing the spider (spiders.py)
import scrapy
from qcwy01.items import Qcwy01Item

class QcwySpider(scrapy.Spider):
    name = 'qcwy'
    allowed_domains = ['51job.com']  # domain only, not a full URL
    start_urls = ['https://search.51job.com/list/000000,000000,0130%252C7501%252'
                  'C7506%252C7502,01%252C32%252C38,9,99,%2520,2,1.html?lang=c&stype'
                  '=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
                  '&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0'
                  '&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from'
                  '=&welfare=']

    def parse(self, response):
        # links to the job detail pages
        urls = response.xpath("//*[@id='resultList']/div[@class='el']/p/span/a/@href").getall()
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_html, dont_filter=True)
        # link to the next results page
        next_page = response.xpath("//div[@class='p_in']//li[last()]/a/@href").get()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)

    def parse_html(self, response):
        item = Qcwy01Item()
        try:
            name = response.xpath("//div[@class='cn']/h1/text()").getall()[0]  # first text node
            money = response.xpath("//div[@class='cn']//strong/text()").get()
            danwei = response.xpath("//div[@class='cn']//p[@class='cname']/a[1]/@title").get()
            didian = response.xpath("//div[@class='cn']//p[@class='msg ltype']/text()").getall()[0]
            jinyan = response.xpath("//div[@class='cn']//p[@class='msg ltype']/text()").getall()[1]  # second text node
            xueli = response.xpath("//div[@class='cn']//p[@class='msg ltype']/text()").getall()[2]  # third text node
            # job responsibilities
            zhizhes = response.xpath("//div[@class='bmsg job_msg inbox']//text()").getall()
            zhizhe_str = ""
            for zhizhe in zhizhes:
                zhizhe_str += zhizhe.strip()
            # required skills
            jineng = ""
            key = response.xpath("//p[@class='fp'][2]/a/text()").getall()
            for i in key:
                jineng += i + " "
        except:  # on any parse failure, fall back to empty fields
            name = ""
            money = ""
            danwei = ""
            didian = ""
            jinyan = ""
            xueli = ""
            zhizhe_str = ""
            jineng = ""
        finally:  # populate and yield the item
            item["name"] = name
            item["money"] = money
            item["danwei"] = danwei
            item["didian"] = didian
            item["jinyan"] = jinyan
            item["xueli"] = xueli
            item["zhizhe"] = zhizhe_str
            item["jineng"] = jineng
            yield item
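The spider is normally launched with "scrapy crawl qcwy" from the project root. For completeness, a minimal sketch of running it programmatically instead (standard Scrapy API; the file name run_qcwy.py is hypothetical):

# run_qcwy.py -- hypothetical launcher, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads settings.py, including the MongoDB pipeline
process.crawl('qcwy')                             # spider name defined above
process.start()                                   # blocks until the crawl finishes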
Writing settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.14 Safari/537.36'
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = False
MONGODB_HOST='127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'qcwy_db'
MONGODB_DOCNAME = 'qcwyTable'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en',
    # the header key is 'User-Agent'; 'User_Agent' would be sent verbatim and ignored
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.14 Safari/537.36'
}
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
'qcwy01.pipelines.Qcwy01Pipeline': 300,
}
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

class Qcwy01Pipeline:
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        self.client = pymongo.MongoClient(host=host, port=port)
        self.db = self.client[settings['MONGODB_DBNAME']]
        self.coll = self.db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        self.coll.insert_one(data)  # insert() is removed in modern pymongo; use insert_one()
        return item

    def close_spider(self, spider):  # Scrapy calls close_spider(), not close()
        self.client.close()
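A common variant is to open the MongoClient in open_spider() and close it in close_spider(), tying the connection's lifetime to the crawl; the constructor version above also works because Scrapy instantiates the pipeline once per process.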
Run results:
Viewing the results in MongoDB:
The second site scraped was Lagou (拉勾网); its data appears later as the job table in step 4 of section III.
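Section II below tails /opt/software/qiancheng.txt with Flume, so the MongoDB collection first has to be dumped into that comma-delimited file; the original does not show this step. A minimal sketch, assuming the MongoDB settings above and the column order of the Hive table t_qiancheng in section III (the script name is hypothetical):

# export_qiancheng.py -- hypothetical dump script
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
coll = client['qcwy_db']['qcwyTable']

fields = ['name', 'money', 'danwei', 'didian', 'jinyan', 'xueli', 'zhizhe', 'jineng']
with open('/opt/software/qiancheng.txt', 'w', encoding='utf-8') as f:
    for doc in coll.find():
        # Mongo's _id fills the Hive table's id column
        row = [str(doc['_id'])]
        # commas or newlines inside a value would break the comma-delimited Hive table
        row += [str(doc.get(k, '')).replace(',', ' ').replace('\n', ' ') for k in fields]
        f.write(','.join(row) + '\n')
client.close()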
II. Storing the scraped data on HDFS, collecting the log with Flume
1. Create the configuration file, in the flume/conf directory:
vi qc.conf
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /opt/software/qiancheng.txt
a1.sources.r1.channels = c1
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = hdfs://bigdata105:9000/flume/tailout/%y-%m-%d/%H-%M/
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Output file type: the default is SequenceFile; DataStream writes plain text
a1.sinks.k1.hdfs.fileType = DataStream
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
Collect the log file with Flume:
[root@bigdata105 conf]# ../bin/flume-ng agent -c conf -f qc.conf -n a1 -Dflume.root.logger=DEBUG,console
III. Data analysis
1. Compute the average, maximum, and minimum salary for positions such as '数据分析' (data analysis), '大数据开发工程师' (big data development engineer), and '数据采集' (data collection), and present the results as a bar chart.
Create the database and table in Hive (the Sqoop export paths below imply the database is named qc):
create database if not exists qc;
use qc;
create table t_qiancheng(id string,name string,money string,danwei string,didian string,jinyan string,xueli string,zhizhe string,jineng string) row format delimited fields terminated by ',';
Load the data:
load data local inpath "/opt/software/qiancheng.txt" into table t_qiancheng;
Keep only the name and money fields:
create table name_money as select name,money from t_qiancheng;
Extract the rows for each job title:
create table m1 as select name,money from name_money where name like '%数据分析%';
create table m2 as select name,money from name_money where name like '%数据采集%';
create table m3 as select name,money from name_money where name like '%大数据%';
Salary analysis for the 数据分析 positions (table m1)
Convert the salary figures, which are quoted in mixed units
1. Separate the per-year and per-month salaries:
create table m4 as select * from m1 where money like '%年';
create table m5 as select * from m1 where money like '%月';
2. Strip the '/年' and '/月' suffixes:
create table m6 as select regexp_replace(money, '/月', '')as pay from m5;
create table m7 as select regexp_replace(money, '/年', '')as pay from m4;
Separate the rows quoted in '千' (thousands) from those in '万' (ten thousands):
create table m8 as select regexp_replace(pay, '千','')as pay from m6 where pay like '%千';
create table m9 as select regexp_replace(pay, '万','')as pay from m6 where pay like '%万';
create table m10 as select regexp_replace(pay, '万','')as pay from m7 where pay like '%万';
Split the salary range into minimum and maximum:
create table m11 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from m8;
create table m12 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from m9;
create table m13 as select split(pay,'-')[0] as min ,split(pay,'-')[1] as max from m10;
Remove the stray 'money:' prefix left over from the export:
create table m14 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from m11;
create table m15 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from m12;
create table m16 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from m13;
Convert everything to CNY per month ('千/月' ×1000, '万/月' ×10000, '万/年' ×10000÷12):
create table m18 as select min*1000 as min,max*1000 as max,(min+max)/2*1000 as avg from m14;
create table m19 as select min*10000 as min,max*10000 as max,(min+max)/2*10000 as avg from m15;
create table m17 as select min*10000/12 as min,max*10000/12 as max,(min+max)/2*10000/12 as avg from m16;
Merge the converted rows into one table and aggregate:
insert into m18 select * from m19;
insert into m18 select * from m17;
create table all03 as select min(min) as min,max(max) as max,avg(avg) as avg from m18;
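The m4–m19 chain above normalizes salary strings such as '1.5-2万/月' or '8-10万/年' to CNY per month. As a cross-check outside Hive, the same normalization can be sketched in pandas (a hypothetical helper, not part of the original pipeline; it assumes a clean comma-delimited file, whereas the Hive chain also has to strip a stray 'money:' prefix):

import pandas as pd

def to_monthly(money):
    # '1.5-2万/月' -> (15000.0, 20000.0, 17500.0); None if the format is unexpected
    try:
        pay, period = money.split('/')            # e.g. '1.5-2万' and '月'
        unit = {'万': 10000, '千': 1000}[pay[-1]]
        lo, hi = (float(x) * unit for x in pay[:-1].split('-'))
        if period == '年':                         # yearly quotes -> per month
            lo, hi = lo / 12, hi / 12
        return lo, hi, (lo + hi) / 2
    except (ValueError, KeyError, IndexError, AttributeError):
        return None

df = pd.read_csv('/opt/software/qiancheng.txt', header=None,
                 names=['id', 'name', 'money', 'danwei', 'didian',
                        'jinyan', 'xueli', 'zhizhe', 'jineng'])
parsed = df.loc[df['name'].str.contains('数据分析', na=False), 'money'].map(to_monthly).dropna()
mins, maxs, avgs = zip(*parsed)
print(min(mins), max(maxs), sum(avgs) / len(avgs))  # should agree with the all03 row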
The '数据采集' and '大数据' salary analyses follow the same steps; the results are shown in the table below.
Export the Hive table to MySQL with Sqoop
Create the table in MySQL:
create table work1(
min double,
max double,
avg double
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
2. Export the table (the result tables were created with CREATE TABLE AS, so their fields use Hive's default \001 delimiter):
sh sqoop export --connect "jdbc:mysql://192.168.1.105:3306/hive_dbs?characterEncoding=UTF-8" --username root --password 123456 --table work1 --fields-terminated-by '\001' --export-dir '/user/hive/warehouse/qc.db/all03';
Visualization
The code:
# coding=gbk
from pyecharts import options as opts
from pyecharts.charts import Bar

# values copied from the MySQL result tables
post_list = ["数据分析", "数据采集", "大数据"]
avg_list = [10627.073782016038, 8286.754160910472, 7071.929824561404]
max_list = [500000, 180000, 30000]
min_list = [300, 300, 500]
bar = (
    Bar(
        init_opts=opts.InitOpts(width="1600px", height="900px"),
    )
    .set_global_opts(
        # title and subtitle
        title_opts=opts.TitleOpts(title="行业薪资", subtitle="单位 元/月"),
        # rotate the x-axis labels
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
        # show the toolbox
        toolbox_opts=opts.ToolboxOpts()
    )
    # bind the data
    .add_xaxis(post_list)  # x-axis categories
    # y-axis series
    .add_yaxis('平均工资', avg_list)
    .add_yaxis('最高工资', max_list)
    .add_yaxis('最低工资', min_list)
)
bar.render("行业薪资.html")
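The three value lists above were copied by hand from the MySQL results. A hedged alternative is to read them from MySQL directly; this sketch assumes the pymysql package and that the per-category results were exported into tables work1, work2, and work3 (only work1 is created above):

# hypothetical fetch; table names work2 and work3 are assumed
import pymysql

conn = pymysql.connect(host='192.168.1.105', user='root',
                       password='123456', database='hive_dbs', charset='utf8')
avg_list, max_list, min_list = [], [], []
with conn.cursor() as cur:
    for table in ('work1', 'work2', 'work3'):
        cur.execute('SELECT min, max, avg FROM ' + table)
        lo, hi, avg = cur.fetchone()
        min_list.append(lo)
        max_list.append(hi)
        avg_list.append(avg)
conn.close()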
2. Count the openings for '数据分析', '大数据开发工程师', '数据采集' and other big-data-related positions in Chengdu, Beijing, Shanghai, Guangzhou, and Shenzhen, and present the results as a pie chart.
Extract the rows for each job title:
create table n1 as select name,didian from t_qiancheng where name like '%数据分析%';
create table n2 as select name,didian from t_qiancheng where name like '%数据采集%';
create table n3 as select name,didian from t_qiancheng where name like '%大数据%';
Start with the 大数据-related positions.
Extract the rows for Chengdu, Beijing, Shanghai, Guangzhou, and Shenzhen (all from n3):
create table k1 as select * from n3 where didian like '%成都%';
create table k2 as select * from n3 where didian like '%北京%';
create table k3 as select * from n3 where didian like '%上海%';
create table k4 as select * from n3 where didian like '%广州%';
create table k5 as select * from n3 where didian like '%深圳%';
Count the openings per city:
create table b7 as select count(*) from k1;
create table b8 as select count(*) from k2;
create table b9 as select count(*) from k3;
create table b10 as select count(*) from k4;
create table b11 as select count(*) from k5;
Merge the counts into one table:
insert into b7 select * from b8;
insert into b7 select * from b9;
insert into b7 select * from b10;
insert into b7 select * from b11;
The '数据采集' and '数据分析' counts for the five cities are obtained the same way.
Save the data to MySQL; create the table there (matching Hive's default count(*) column name _c0):
create table work5(
_c0 bigint
);
Export the data:
sh sqoop export --connect jdbc:mysql://192.168.1.105:3306/hive_dbs --username root -P --table work5 --export-dir '/user/hive/warehouse/qc.db/b7' --fields-terminated-by '\001';
Visualization
The code:
# coding=gbk
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False     # keep minus signs readable with SimHei
sd = [6, 10, 2, 9, 10]                         # per-city counts from work5
labels = ['成都', '北京', '上海', '广州', '深圳']
plt.pie(x=sd, labels=labels, autopct='%.1f%%')
plt.title("大数据")
# plt.savefig("大数据岗位分布.png")  # optional: persist the chart before showing it
plt.show()
3. Analyze the salary level (average, maximum, minimum) of big-data positions with 1–3 years of work experience, and present the results as a bar chart.
Extract the 大数据 positions:
create table p1 as select name,money,jinyan from t_qiancheng where name like '%大数据%';
Extract the rows with 1–3 years of experience (a coarse LIKE match on the digits 1, 2, 3):
create table p2 as select name,money,jinyan from p1 where jinyan like '%1%';
create table p3 as select name,money,jinyan from p1 where jinyan like '%2%';
create table p4 as select name,money,jinyan from p1 where jinyan like '%3%';
Merge the rows into one table:
insert into p2 select * from p4;
insert into p2 select * from p3;
Keep only the title and salary fields, and strip the quotation marks:
create table p5 as select name,money from p2;
create table p6 as select regexp_replace(name, '"','')as name,regexp_replace(money, '"','')as money from p5;
Separate the per-year and per-month salaries:
create table p7 as select * from p6 where money like '%年';
create table p8 as select * from p6 where money like '%月';
Strip the '/月' and '/年' suffixes:
create table p9 as select regexp_replace(money, '/月', '')as money from p8;
create table p10 as select regexp_replace(money, '/年', '')as money from p7;
Separate the '千' and '万' salaries:
create table p11 as select regexp_replace(money, '千','')as money from p9 where money like '%千';
create table p12 as select regexp_replace(money, '万','')as money from p9 where money like '%万';
create table p13 as select regexp_replace(money, '万','')as money from p10 where money like '%万';
Split the salary range into minimum and maximum:
create table p14 as select split(money,'-')[0] as min ,split(money,'-')[1] as max from p13;
create table p15 as select split(money,'-')[0] as min ,split(money,'-')[1] as max from p11;
create table p16 as select split(money,'-')[0] as min ,split(money,'-')[1] as max from p12;
Remove the stray 'money:' prefix:
create table p17 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from p14;
create table p18 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from p15;
create table p19 as select regexp_replace(max, 'money:','')as max,regexp_replace(min, 'money:','')as min from p16;
Convert everything to CNY per month ('千/月' ×1000, '万/月' ×10000, '万/年' ×10000÷12):
create table p21 as select min*1000 as min,max*1000 as max,(min+max)/2*1000 as avg from p18;
create table p22 as select min*10000 as min,max*10000 as max,(min+max)/2*10000 as avg from p19;
create table p20 as select min*10000/12 as min,max*10000/12 as max,(min+max)/2*10000/12 as avg from p17;
Merge everything into one table and aggregate:
insert into p20 select * from p21;
insert into p20 select * from p22;
create table all04 as select min(min) as min,max(max) as max,avg(avg) as avg from p20;
Export the result to MySQL; first create the table:
create table work6(
min double,
max double,
avg double
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
Export the data (all04 is a CREATE TABLE AS result, so its fields use Hive's default \001 delimiter):
sh sqoop export --connect jdbc:mysql://192.168.1.105:3306/hive_dbs --username root -P --table work6 --export-dir '/user/hive/warehouse/qc.db/all04' --fields-terminated-by '\001';
Visualization
The code:
# coding=gbk
from pyecharts import options as opts
from pyecharts.charts import Bar

# values copied from the work6 table in MySQL
post_list = ["大数据"]
avg_list = [16119.764167546637]
max_list = [160000]
min_list = [1000]
bar = (
    Bar(
        init_opts=opts.InitOpts(width="1600px", height="900px"),
    )
    .set_global_opts(
        # title and subtitle
        title_opts=opts.TitleOpts(title="大数据薪资分析", subtitle="单位 元/月"),
        # rotate the x-axis labels
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
        # show the toolbox
        toolbox_opts=opts.ToolboxOpts()
    )
    # bind the data
    .add_xaxis(post_list)  # x-axis categories
    # y-axis series
    .add_yaxis('平均工资', avg_list)
    .add_yaxis('最高工资', max_list)
    .add_yaxis('最低工资', min_list)
)
bar.render("大数据薪资分析.html")
4. Analyze the month-by-month demand trend for big-data positions, and present the result as a line chart.
Extract the big-data rows (this part uses the Lagou data, loaded into the job table):
create table w1 as select * from job where position_name like '%大数据%';
Extract the release dates:
create table w2 as select release_date from w1;
Count the postings for each month:
create table w3 as select count(*) from w2 where release_date like '%05%';
create table w4 as select count(*) from w2 where release_date like '%06%';
create table w5 as select count(*) from w2 where release_date like '%07%';
Merge the counts into one table:
insert into w3 select * from w4;
insert into w3 select * from w5;
Export the data to MySQL.
Create the table (matching Hive's default count(*) column name _c0):
create table work8(
_c0 bigint
);
sh sqoop export --connect jdbc:mysql://192.168.1.105:3306/hive_dbs --username root -P --table work8 --export-dir '/user/hive/warehouse/qc.db/w3' --fields-terminated-by '\001';
Visualizing the demand trend for big-data positions
# coding=gbk
from pyecharts import options as opts
from pyecharts.charts import Line  # the task calls for a line chart, so use Line rather than Bar

# monthly posting counts from the work8 table
post_list = ["五月", "六月", "七月"]
all_list = [277, 743, 2307]
line = (
    Line(
        init_opts=opts.InitOpts(width="1600px", height="900px"),
    )
    .set_global_opts(
        # title
        title_opts=opts.TitleOpts(title="大数据岗位趋势"),
        # rotate the x-axis labels
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 30}),
        # show the toolbox
        toolbox_opts=opts.ToolboxOpts()
    )
    # bind the data
    .add_xaxis(post_list)           # x-axis: months
    .add_yaxis('岗位数', all_list)  # y-axis: number of postings
)
line.render("大数据需求走势.html")