hadoop完全分布+hive数据分析

 

 

 

hadoop搭建基础:

Hadoop:

有道云笔记:http://note.youdao.com/noteshare?id=6afacf1ae14aef32ea32b1c361771eea&sub=832AC6E58A8F4E72B12E4C6A020A5520

hive+mysql搭建:


有道云笔记:http://note.youdao.com/noteshare?id=332ed266d16a8b50dca48d1c6e0e4624&sub=3FE5A57DCC394D73B2187610CF03AA91

flume搭建:


有道云笔记:http://note.youdao.com/noteshare?id=d6f8928a8e813980054b3a785f98de10&sub=6A004440061B4675AB75EE534BF03B03

flume基础应用:


有道云笔记:http://note.youdao.com/noteshare?id=7918e87ddc8a1f8d806be13804463a15&sub=13E6C1DCD231448AB0AF6981426F7C67

spark安装:


有道云笔记:http://note.youdao.com/noteshare?id=ef3a676af0e2f8b4be6dc3ddb2757353&sub=5D2CFF72B3964AE392B1212EF96C0871

 

一、爬取数据:

1,scrapy爬取智联招聘和中华英才网

有道云链接:http://note.youdao.com/noteshare?id=5edad3cccfc222f91bda2b5499df8806&sub=029090D747894803ABE1D6AD1E496AAD

 

二、flume传输数据:

 

flume配置文件

# Name the components on this agent.
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source: tail the scraped job-data file.
# NOTE: comments must sit on their own line -- the properties parser has no
# inline comments, so a trailing "#..." would become part of the value
# (previously it was appended to the tail command and the HDFS path).
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/data/job.txt
a1.sources.r1.channels = c1

# Describe the sink: collect events into time-bucketed HDFS directories.
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = hdfs://hadoop01:9000/flume/tailout/%y-%m-%d/%H-%M/
a1.sinks.k1.hdfs.filePrefix = log-
# Round the path timestamp down to 10-minute buckets.
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
# Roll files every 3 s / 20 bytes / 5 events -- tiny demo values; raise for
# real workloads or HDFS fills up with small files.
a1.sinks.k1.hdfs.rollInterval = 3
a1.sinks.k1.hdfs.rollSize = 20
a1.sinks.k1.hdfs.rollCount = 5
a1.sinks.k1.hdfs.batchSize = 1
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Output file type: the default is SequenceFile; DataStream writes plain text.
a1.sinks.k1.hdfs.fileType = DataStream

# Use a channel which buffers events in memory.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100


启动flume:

# Start agent "a1" with the config above, logging to the console.
flume-ng agent --conf conf --conf-file shixun.conf --name a1 -Dflume.root.logger=INFO,console

向被监控的 /home/data/job.txt 追加数据,flume 就会将数据传到 hdfs 上

 

三、hive数据处理

 

有道云笔记:http://note.youdao.com/noteshare?id=f4207943f87a4f8946afd38bef6396b8&sub=0E74436F444B4742B3C1F8998D28A884

1,编写hql语句,创建数据库,创建表,处理数据

fenxi.hql

-- fenxi.hql: salary statistics (overall, and restricted to postings asking
-- for 0-3 years of experience) plus per-city job counts.
create database fenxi;
use fenxi;

-- Raw scraped rows; fields are tab-delimited.
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by '\t';
load data local inpath '/home/sofware/hive-2.3.6/data/fenxi.txt' into table shuju;

-- Strip the fixed field-name prefixes; the group by deduplicates identical postings.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay, substr(site,9,2) as site, substr(exp,15) as exp, substr(dates,10,5) as dates from shuju group by jobtitle, pay, site, exp, dates;

-- Drop the stray double quotes left over from the scraper's CSV export.
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '') as pay, regexp_replace(site, '"', '') as site, regexp_replace(exp, '"', '') as exp from shuju2;

-- Normalize pay to CNY/month: 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Overall salary statistics (exported to MySQL table gz).
create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Keep only postings asking for 0-3 years of experience, then repeat the
-- same pay normalization on that subset (into shuju2s).
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Salary statistics for the 1-3-years-experience subset.
-- BUG FIX: this previously aggregated shuju2y (the unfiltered data), which
-- made wages identical to gzs; it must read the filtered table shuju2s.
create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Job counts for the five major cities (exported to MySQL table gwei).
create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

-- Clean up intermediate tables.
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;

kaifa.hql

-- kaifa.hql: same pipeline as fenxi.hql but for the development-job dataset
-- loaded from HDFS; fields are comma-delimited.
create database kaifa;
use kaifa;
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by ',';
-- No "local": the data was put on HDFS by flume.
load data inpath '/lmq/trains/kaifa' into table shuju;

-- Strip the fixed field-name prefixes; the group by deduplicates identical postings.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay, substr(site,9,2) as site, substr(exp,15) as exp, substr(dates,10,5) as dates from shuju group by jobtitle, pay, site, exp, dates;

-- Drop the stray double quotes left over from the scraper's CSV export.
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '') as pay, regexp_replace(site, '"', '') as site, regexp_replace(exp, '"', '') as exp from shuju2;

-- Normalize pay to CNY/month: 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Overall salary statistics.
create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Keep only postings asking for 0-3 years of experience, then repeat the
-- same pay normalization on that subset (into shuju2s).
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Salary statistics for the 1-3-years-experience subset.
-- BUG FIX: previously read shuju2y (the unfiltered data), making wages
-- identical to gzs; it must read the filtered table shuju2s.
create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Job counts for the five major cities.
create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

-- Clean up intermediate tables.
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;

cji.hql

-- cji.hql: same pipeline as fenxi.hql/kaifa.hql but for the test-job dataset
-- loaded from HDFS; fields are comma-delimited.
create database cji;
use cji;
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by ',';
-- No "local": the data was put on HDFS by flume.
load data inpath '/lmq/trains/cji' into table shuju;

-- Strip the fixed field-name prefixes; the group by deduplicates identical postings.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(pay,8) as pay, substr(site,9,2) as site, substr(exp,15) as exp, substr(dates,10,5) as dates from shuju group by jobtitle, pay, site, exp, dates;

-- Drop the stray double quotes left over from the scraper's CSV export.
create table basis as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(pay, '"', '') as pay, regexp_replace(site, '"', '') as site, regexp_replace(exp, '"', '') as exp from shuju2;

-- Normalize pay to CNY/month: 千/月 -> *1000, 万/月 -> *10000, 万/年 -> *10000/12.
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2y as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2y select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Overall salary statistics.
create table gzs as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2y;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Keep only postings asking for 0-3 years of experience, then repeat the
-- same pay normalization on that subset (into shuju2s).
insert overwrite table basis select * from basis where exp regexp '1年经验|2年经验|3年经验|1-2年经验|1-3年经验|2-3年经验|无需经验';
create table shuju1y as select jobtitle, regexp_replace(pay, '千/月', '') as pay from basis where pay like '%千/月';
create table shuju1yw as select jobtitle, regexp_replace(pay, '万/月', '') as pay from basis where pay like '%万/月';
create table shuju1n as select jobtitle, regexp_replace(pay, '万/年', '') as pay from basis where pay like '%万/年';
create table shuju2s as select jobtitle, split(pay,'-')[0]*1000 as min, split(pay,'-')[1]*1000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*1000 as avg from shuju1y;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000 as min, split(pay,'-')[1]*10000 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000 as avg from shuju1yw;
insert into shuju2s select jobtitle, split(pay,'-')[0]*10000/12 as min, split(pay,'-')[1]*10000/12 as max, (split(pay,'-')[0]+split(pay,'-')[1])/2*10000/12 as avg from shuju1n;
-- Salary statistics for the 1-3-years-experience subset.
-- BUG FIX: previously read shuju2y (the unfiltered data), making wages
-- identical to gzs; it must read the filtered table shuju2s.
create table wages as select min(min) as min, max(max) as max, avg(avg) as avg from shuju2s;
drop table shuju1y;
drop table shuju1yw;
drop table shuju1n;

-- Job counts for the five major cities.
create table kgwei as select site, count(site) as counts from basis where site regexp '上海|成都|北京|深圳|广州' group by site;

-- Clean up intermediate tables.
drop table shuju;
drop table shuju2;
drop table shuju2s;
drop table shuju2y;

qushi.hql

-- qushi.hql: build the job-posting trend table (title + posting date),
-- later exported to MySQL with sqoop.
create database qushi;
use qushi;
create table shuju(id string, dates string, jobtitle string, pay string, unit string, exp string, edu string, site string) row format delimited fields terminated by ',';
load data local inpath '/home/sofware/hive-2.3.6/data/shuju.txt' into table shuju;
-- Strip the fixed field-name prefixes; the group by deduplicates.
create table shuju2 as select substr(jobtitle,13) as jobtitle, substr(dates,10,5) as dates from shuju group by jobtitle, dates;
-- Remove stray quotes; this is the table sqoop exports.
create table qushi as select regexp_replace(jobtitle, '"', '') as jobtitle, regexp_replace(dates, '"', '') as dates from shuju2 group by jobtitle, dates;
-- Clean up intermediates (the sibling scripts all do this; previously
-- this script leaked shuju and shuju2).
drop table shuju;
drop table shuju2;

 

1.2、启动hive,并执行hql命令

hive> source /home/sofware/hive-2.3.6/data/fenxi.hql;
hive> source /home/sofware/hive-2.3.6/data/cji.hql;
hive> source /home/sofware/hive-2.3.6/data/kaifa.hql;
hive> source /home/sofware/hive-2.3.6/data/qushi.hql;

 

 

 

执行后得到表

 

 

 

将处理后的数据使用sqoop传到mysql上

2,在MySQL上创建表

工资表

 create table gz(min varchar(200),max varchar(200),avg varchar(200));

1-3年工资

 create table wages(min varchar(200),max varchar(200),avg varchar(200));

每个城市岗位数

-- Job count per city; counts is a number (hive count() output), not text.
create table gwei(city varchar(200), counts bigint);

趋势

-- Trend table: one row per (job title, posting date) pair; both are strings.
create table qushi(title varchar(200),dates varchar(200));

 

3,sqoop将hive的数据导入到mysql

# Export the hive result tables to the MySQL database "shixun".
# --input-fields-terminated-by '\001': Ctrl-A is hive's default field
# delimiter for tables stored as text in the warehouse directory.
# NOTE(review): a plaintext --password is visible in shell history and ps;
# prefer -P (prompt) or --password-file outside of a classroom setup.
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table wages --export-dir '/user/hive/warehouse/shixun.db/wages' --input-fields-terminated-by '\001'
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table gz --export-dir '/user/hive/warehouse/fenxi.db/gzs' --input-fields-terminated-by '\001'
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table gwei --export-dir '/user/hive/warehouse/fenxi.db/kgwei' --input-fields-terminated-by '\001'
sqoop export --connect jdbc:mysql://192.168.204.133:3306/shixun --username root --password 'lmq12345L@'  --table qushi --export-dir '/user/hive/warehouse/qushi.db/qushi' --input-fields-terminated-by '\001'

 

 

 

 

四、可视化图

1,工资

2,岗位数

3,经验是1-3年的工资

4,趋势图

 

 

 

 

  • 4
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值