hadoop完全分布+hive数据分析

 

 

 

hadoop搭建基础:

Hadoop:

有道云笔记:http://note.youdao.com/noteshare?id=6afacf1ae14aef32ea32b1c361771eea&sub=832AC6E58A8F4E72B12E4C6A020A5520

hive+mysql搭建:


有道云笔记:http://note.youdao.com/noteshare?id=332ed266d16a8b50dca48d1c6e0e4624&sub=3FE5A57DCC394D73B2187610CF03AA91

flume搭建:


有道云笔记:http://note.youdao.com/noteshare?id=d6f8928a8e813980054b3a785f98de10&sub=6A004440061B4675AB75EE534BF03B03

flume基础应用:


有道云笔记:http://note.youdao.com/noteshare?id=7918e87ddc8a1f8d806be13804463a15&sub=13E6C1DCD231448AB0AF6981426F7C67

spark安装:


有道云笔记:http://note.youdao.com/noteshare?id=ef3a676af0e2f8b4be6dc3ddb2757353&sub=5D2CFF72B3964AE392B1212EF96C0871

 

一、爬取数据:

1,scrapy爬取智联招聘和中华英才网

有道云链接:http://note.youdao.com/noteshare?id=5edad3cccfc222f91bda2b5499df8806&sub=029090D747894803ABE1D6AD1E496AAD

 

二、flume传输数据:

 

flume配置文件

# Name the components on this agent.
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source: tail the scraped job-data file.
# NOTE: comments must sit on their own line -- the properties parser has no
# inline comments, so a trailing "#..." would become part of the value
# (previously it was appended to the tail command and the HDFS path).
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/data/job.txt
a1.sources.r1.channels = c1

# Describe the sink: collect events into time-bucketed HDFS directories.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = hdfs://hadoop01:9000/flume/tailout/%y-%m-%d/%H-%M/
a1.sinks.k1.hdfs.filePrefix = log-
# Round the path timestamp down to 10-minute buckets.
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
# Roll files every 3 s / 20 bytes / 5 events -- tiny demo values; raise for
# real workloads or HDFS fills up with small files.
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Output file type: the default is SequenceFile; DataStream writes plain text.
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


启动flume:

# Start agent "a1" with the config above, logging to the console.
flume-ng agent --conf conf --conf-file shixun.conf --name a1 -Dflume.root.logger=INFO,console

向被监控的 /home/data/job.txt 追加数据,flume 就会将数据传到 hdfs 上

 

三、hive数据处理

 

有道云笔记:http://note.youdao.com/noteshare?id=f4207943f87a4f8946afd38bef6396b8&sub=0E74436F444B4742B3C1F8998D28A884

1,编写hql语句,创建数据库,创建表,处理数据

fenxi.hql

-- fenxi.hql: salary statistics (overall, and restricted to postings asking
-- for 0-3 years of experience) plus per-city job counts.
create database fenxi;
use fenxi;

-- Raw scraped rows; fields are tab-delimited.
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by '\t';
load data local inpath '/home/sofware/hive-2.3.6/data/fenxi.txt' into table shuju;

-- Strip the fixed field-name prefixes; the group by deduplicates identical postings.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay, substr(site,9,2) as site, substr(exp,15) as exp, substr(dates,10,5) as dates from shuju group by jobtitle, pay, site, exp, dates;

-- Drop the stray double quotes left over from the scraper's CSV export.
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '') as pay, regexp_replace(site, '"', '') as site, regexp_replace(exp, '"', '') as exp from shuju2;

-- Normalize pay to CNY/month: 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Overall salary statistics (exported to MySQL table gz).
create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Keep only postings asking for 0-3 years of experience, then repeat the
-- same pay normalization on that subset (into shuju2s).
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Salary statistics for the 1-3-years-experience subset.
-- BUG FIX: this previously aggregated shuju2y (the unfiltered data), which
-- made wages identical to gzs; it must read the filtered table shuju2s.
create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Job counts for the five major cities (exported to MySQL table gwei).
create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

-- Clean up intermediate tables.
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;

kaifa.hql

-- kaifa.hql: same pipeline as fenxi.hql but for the development-job dataset
-- loaded from HDFS; fields are comma-delimited.
create database kaifa;
use kaifa;
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by ',';
-- No "local": the data was put on HDFS by flume.
load data inpath '/lmq/trains/kaifa' into table shuju;

-- Strip the fixed field-name prefixes; the group by deduplicates identical postings.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay, substr(site,9,2) as site, substr(exp,15) as exp, substr(dates,10,5) as dates from shuju group by jobtitle, pay, site, exp, dates;

-- Drop the stray double quotes left over from the scraper's CSV export.
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '') as pay, regexp_replace(site, '"', '') as site, regexp_replace(exp, '"', '') as exp from shuju2;

-- Normalize pay to CNY/month: 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Overall salary statistics.
create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Keep only postings asking for 0-3 years of experience, then repeat the
-- same pay normalization on that subset (into shuju2s).
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Salary statistics for the 1-3-years-experience subset.
-- BUG FIX: previously read shuju2y (the unfiltered data), making wages
-- identical to gzs; it must read the filtered table shuju2s.
create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Job counts for the five major cities.
create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

-- Clean up intermediate tables.
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;

cji.hql

-- cji.hql: same pipeline as fenxi.hql/kaifa.hql but for the test-job dataset
-- loaded from HDFS; fields are comma-delimited.
create database cji;
use cji;
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by ',';
-- No "local": the data was put on HDFS by flume.
load data inpath '/lmq/trains/cji' into table shuju;

-- Strip the fixed field-name prefixes; the group by deduplicates identical postings.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay, substr(site,9,2) as site, substr(exp,15) as exp, substr(dates,10,5) as dates from shuju group by jobtitle, pay, site, exp, dates;

-- Drop the stray double quotes left over from the scraper's CSV export.
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '') as pay, regexp_replace(site, '"', '') as site, regexp_replace(exp, '"', '') as exp from shuju2;

-- Normalize pay to CNY/month: 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Overall salary statistics.
create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Keep only postings asking for 0-3 years of experience, then repeat the
-- same pay normalization on that subset (into shuju2s).
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Salary statistics for the 1-3-years-experience subset.
-- BUG FIX: previously read shuju2y (the unfiltered data), making wages
-- identical to gzs; it must read the filtered table shuju2s.
create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Job counts for the five major cities.
create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

-- Clean up intermediate tables.
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;

qushi.hql

-- qushi.hql: build the job-posting trend table (title + posting date),
-- later exported to MySQL with sqoop.
create database qushi;
use qushi;
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by ',';
load data local inpath '/home/sofware/hive-2.3.6/data/shuju.txt' into table shuju;
-- Strip the fixed field-name prefixes; the group by deduplicates.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(dates,10,5) as dates from shuju group by jobtitle, dates;
-- Remove stray quotes; this is the table sqoop exports.
create table qushi as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(dates, '"', '') as dates from shuju2 group by jobtitle, dates;
-- Clean up intermediates (the sibling scripts all do this; previously
-- this script leaked shuju and shuju2).
drop table shuju;
drop table shuju2;

 

1.2、启动hive,并执行hql命令

hive> source /home/sofware/hive-2.3.6/data/fenxi.hql;
hive> source /home/sofware/hive-2.3.6/data/cji.hql;
hive> source /home/sofware/hive-2.3.6/data/kaifa.hql;
hive> source /home/sofware/hive-2.3.6/data/qushi.hql;

 

 

 

执行后得到表

 

 

 

将处理后的数据使用sqoop传到mysql上

2,在MySQL上创建表

工资表

 create table gz(min varchar(200),max varchar(200),avg varchar(200));

1-3年工资

 create table wages(min varchar(200),max varchar(200),avg varchar(200));

每个城市岗位数

-- Job count per city; counts is a number (hive count() output), not text.
create table gwei(city varchar(200), counts bigint);

趋势

-- Trend table: one row per (job title, posting date) pair; both are strings.
create table qushi(title varchar(200),dates varchar(200));

 

3,sqoop将hive的数据导入到mysql

# Export the hive result tables to the MySQL database "shixun".
# --input-fields-terminated-by '\001': Ctrl-A is hive's default field
# delimiter for tables stored as text in the warehouse directory.
# NOTE(review): a plaintext --password is visible in shell history and ps;
# prefer -P (prompt) or --password-file outside of a classroom setup.
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table wages --export-dir '/user/hive/warehouse/shixun.db/wages' --input-fields-terminated-by '\001'
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table gz --export-dir '/user/hive/warehouse/fenxi.db/gzs' --input-fields-terminated-by '\001'
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table gwei --export-dir '/user/hive/warehouse/fenxi.db/kgwei' --input-fields-terminated-by '\001'
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table qushi --export-dir '/user/hive/warehouse/qushi.db/qushi' --input-fields-terminated-by '\001'

 

 

 

 

四、可视化图

1,工资

2,岗位数

3,经验是1-3年的工资

4,趋势图

 

 

 

 

  • 4
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值