Hadoop Big Data Tutorial Notes_09_Complex HiveSQL Usage Examples

Table of Contents

Complex HiveSQL Usage Examples

1. Several ways to save a select query result:

2. Rows to columns: merging one column's values by key

3. Columns to rows: splitting one column's values into multiple rows

4. Implementing wordcount in Hive

5. Cumulative report via a cascading self-join (the clumsy way)

Sections 6-12: Hive window (analytic) functions

6. Window function sum() over(): row-by-row accumulation within a window (the easy way)

7. Top-N per group

8. Ranking / row-numbering functions

9. LAG function

10. LEAD function

11. FIRST_VALUE function

12. LAST_VALUE function

13. Hive local mode (for small data volumes or testing, a local job runs faster)

14. Dynamic partitioning

15. Bucketing:

16. User-defined functions

17. transform example:

18. Daily active and new users

19. Dimension statistics


Complex HiveSQL Usage Examples

 

 

1. Several ways to save a select query result:

--1. Save the query result into a new Hive table
create table t_tmp
as
select * from t_p;

--2. Insert the query result into an existing Hive table
insert into table t_tmp
select * from t_p;

--3. Write the query result to a directory (local filesystem or HDFS)
insert overwrite local directory '/home/hadoop/test'
select * from t_p;

insert overwrite directory '/aaa/test'
select * from t_p;

2. Rows to columns: merging one column's values by key

--concat_ws(',',collect_set(column)): collect_set deduplicates the values; collect_list keeps duplicates
/*stu_subject1.data:
1,zhangsan,化学
1,zhangsan,物理
1,zhangsan,数学
1,zhangsan,语文
2,lisi,化学
2,lisi,数学
2,lisi,生物
2,lisi,生理
2,lisi,卫生
3,wangwu,化学
3,wangwu,语文
3,wangwu,英语
3,wangwu,体育
3,wangwu,生物
*/
create table t_stu_subjects1(id int,name string,subject string)
row format delimited fields terminated by ',';
load data local inpath '/root/t_data/stu_subject1.data' into table t_stu_subjects1;
select id,name,concat_ws(',',collect_set(subject)) subjects from t_stu_subjects1 
group by id,name;
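
For contrast, a minimal sketch on the same table using collect_list, which keeps duplicate subjects instead of deduplicating them the way collect_set does:
select id,name,concat_ws(',',collect_list(subject)) subjects
from t_stu_subjects1
group by id,name;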

3. Columns to rows: splitting one column's values into multiple rows

--lateral view explode():
/*stu_subject.data:
1,zhangsan,化学:物理:数学:语文
2,lisi,化学:数学:生物:生理:卫生
3,wangwu,化学:语文:英语:体育:生物
*/
create table t_stu_subjects(id int,name string,subjects array<string>)
row format delimited fields terminated by ','
collection items terminated by ':';

load data local inpath '/root/t_data/stu_subject.data' into table t_stu_subjects;

--explode the array column into one row per element
select explode(subjects) as subject from t_stu_subjects;
--explode, then deduplicate
select distinct subject from
(select explode(subjects) as subject from t_stu_subjects) tmp;
--use the lateral view table-generating function to keep id and name alongside the exploded rows
select id,name,sub_final
from t_stu_subjects lateral view explode(subjects) tmp as sub_final;

4. Implementing wordcount in Hive

select 
word,count(1) cns 
from (select explode(split(trim(line),' ')) word from t_wordcount) tmp 
group by word order by cns desc;
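
The query above assumes a source table t_wordcount with a single string column named line; a minimal sketch of creating it (the file path is hypothetical):
create table t_wordcount(line string);
load data local inpath '/root/t_data/wordcount.txt' into table t_wordcount;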

5. Cumulative report via a cascading self-join (the clumsy way)

--For each user: total count in the current month, and the cumulative total up to and including that month;
/*access_time.dat  
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
C,2015-01,20
A,2015-02,4
A,2015-02,6
C,2015-02,30
C,2015-02,10
B,2015-02,10
B,2015-02,5
A,2015-03,14
A,2015-03,6
B,2015-03,20
B,2015-03,25
C,2015-03,10
C,2015-03,20
*/
create table t_access_time(uid string,month string,times int)
row format delimited fields terminated by ',';
load data local inpath '/root/t_data/access_time.dat' into table t_access_time;

--aggregate by month to get each user's monthly total (build a temp table)
create table access_temp as
select uid,month,sum(times) sum_times
from t_access_time group by uid,month order by uid,month;

--self-join access_temp, keep only the rows where b.month <= a.month,
--then sum b.sum_times per (uid, month) to get the running total
create table t_tmp as
select *
from
(select a.uid,a.month,a.sum_times,b.sum_times b_sum_times,b.month b_month
from access_temp a join access_temp b on a.uid=b.uid) temp where temp.month>=temp.b_month;

select uid,month,max(sum_times) times_month,sum(b_sum_times) times_all
from t_tmp group by uid,month;

 

Sections 6-12: Hive window (analytic) functions

6. Window function sum() over(): row-by-row accumulation within a window (the easy way)

--On top of the monthly aggregation table (access_temp):
select uid
,month
,sum_times
,sum(sum_times) over(partition by uid order by month rows between unbounded preceding and current row) sum_all
from access_temp;
-- Note: sum() over() requires an argument to sum (the column to accumulate)
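
As a variation (a sketch reusing access_temp from section 5), narrowing the window frame turns the running total into a 3-month moving total:
select uid
,month
,sum_times
,sum(sum_times) over(partition by uid order by month rows between 2 preceding and current row) sum_3m
from access_temp;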

7. Top-N per group

--row_number() over(partition by sex order by age desc): number rows within each group, then filter for the top N
select *
from 
(select *,row_number() over(partition by sex order by age desc) rn from t_rn) tmp 
where tmp.rn<=2;

8. Ranking / row-numbering functions
 

/*
Row numbering / ranking functions; the sample below shows how their results differ:
row_number() over()
rank() over()
dense_rank() over()
ntile(3) over()   -- e.g. to find the people in the top 1/3 of a salary ranking

score   row_number   rank    dense_rank   ntile(3)
89        1            1          1          1
90        2            2          2          1
90        3            2          2          1
91        4            4          3          2
92        5            5          4          2
92        6            5          4          3
93        7            7          5          3
*/
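
A minimal sketch that produces the columns in the comment above, assuming a hypothetical table t_score(name string, score int) holding those scores:
select name,score,
row_number() over(order by score) as rn,
rank() over(order by score) as rk,
dense_rank() over(order by score) as drk,
ntile(3) over(order by score) as nt
from t_score;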
/*t_access  data:
192.168.33.3 ,http://www.edu360.cn/stu,2017-08-04 15:30:20,20170804
192.168.33.3 ,http://www.edu360.cn/teach,2017-08-04 15:35:20,20170804
192.168.33.4 ,http://www.edu360.cn/stu,2017-08-04 15:30:20,20170804
192.168.33.4 ,http://www.edu360.cn/job,2017-08-04 16:30:20,20170804
192.168.33.5 ,http://www.edu360.cn/job,2017-08-04 15:40:20,20170804
192.168.33.3 ,http://www.edu360.cn/stu,2017-08-05 15:30:20,20170805
192.168.44.3 ,http://www.edu360.cn/teach,2017-08-05 15:35:20,20170805
192.168.33.44,http://www.edu360.cn/stu,2017-08-05 15:30:20,20170805
192.168.33.46,http://www.edu360.cn/job,2017-08-05 16:30:20,20170805
192.168.33.55,http://www.edu360.cn/job,2017-08-05 15:40:20,20170805
192.168.133.3,http://www.edu360.cn/register,2017-08-06 15:30:20,20170806
192.168.111.3,http://www.edu360.cn/register,2017-08-06 15:35:20,20170806
192.168.34.44,http://www.edu360.cn/pay,2017-08-06 15:30:20,20170806
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20,20170806
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20,20170806
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20,20170806
192.168.33.25,http://www.edu360.cn/job,2017-08-06 15:40:20,20170806
192.168.33.36,http://www.edu360.cn/excersize,2017-08-06 16:30:20,20170806
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20,20170806
*/

9. LAG function

--lag(access_time,num,default) over(partition by ... order by ...): for each row, takes the value of access_time from num rows earlier in its window (shifting the column down); rows with no earlier row (the first in each partition) get default, here 0
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lag(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;
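
A typical follow-up (a hedged sketch): combine lag with unix_timestamp to get the seconds elapsed since each ip's previous access; the first row of each partition yields NULL:
select ip,url,access_time,
unix_timestamp(access_time) - unix_timestamp(lag(access_time,1) over(partition by ip order by access_time)) as secs_since_last
from t_access;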

10. LEAD function

--lead(access_time,num,default) over(partition by ... order by ...): for each row, takes the value of access_time from num rows later in its window (shifting the column up); rows with no later row (the last in each partition) get default, here 0
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lead(access_time,1,0) over(partition by ip order by access_time) as next_access_time
from t_access;

11. FIRST_VALUE function

--Example: the first page each user (ip) visited
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
first_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as first_url
from t_access;

12. LAST_VALUE function

--Example: the last page each user (ip) visited
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
last_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as last_url
from t_access;

13. Hive local mode (for small data volumes or testing, submitting as a local job runs faster)

set hive.exec.mode.local.auto=true;
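
Two related knobs control when Hive is allowed to pick local mode (a sketch; the values shown are the commonly cited defaults and should be treated as assumptions for your Hive version):
set hive.exec.mode.local.auto.inputbytes.max=134217728;  -- max total input size (about 128 MB) for a local job
set hive.exec.mode.local.auto.input.files.max=4;         -- max number of input files for a local job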

14. Dynamic partitioning

/*
(The trailing fields of the data are mapped directly to partitions; commonly used for second-level partitions.)
When static and dynamic partitions are mixed, the top-level partition must be the static one and the
second-level partition the dynamic one. Hive compiles and creates the static partition spec (the part
outside the query) first, and only derives the dynamic partitions from the query's columns when the
statement actually runs. If the top-level partition were dynamic, the static sub-partition would have to
be created before its parent directory exists on HDFS, so the statement fails.
*/
--(default false) enable dynamic partitioning
set hive.exec.dynamic.partition=true;
--(default strict) nonstrict allows all partition columns to be dynamic; strict requires at least one static partition column
set hive.exec.dynamic.partition.mode=nonstrict;

--Tuning parameters for dynamic partitioning:
--(default 100, often raised, e.g. to 1000) max dynamic partitions each mapper or reducer may create; exceeding it is an error
set hive.exec.max.dynamic.partitions.pernode=100;
--max dynamic partitions a single statement may create; exceeding it is an error
set hive.exec.max.dynamic.partitions=1000; --(default)
--max total number of files that may be created; exceeding it is an error
set hive.exec.max.created.files=100000; --(default)

/*sample data file part2:
1,zhangsan,2020-04-16
2,lisi,2020-04-16
3,wangwu,2020-04-15
4,zhaoliu,2020-04-15
*/
create table t_partition_test(id int,name string) partitioned by(city string,dt string)
row format delimited fields terminated by ',';
create table t_partition_test2(id int,name string) partitioned by(dt string)
row format delimited fields terminated by ',';
create table t_partition_test3(id int,name string) partitioned by(city string,dt string)
row format delimited fields terminated by ',';
​
​
--load data with a dynamic partition column in the partition spec errors out; load data requires all partition values to be static (no workaround)
load data local inpath '/root/t_data/part2' into table t_partition_test partition(city='beijing',dt);
load data local inpath '/root/t_data/part2' into table t_partition_test2 partition(dt='2020-04-14');
load data local inpath '/root/t_data/part2' into table t_partition_test2 partition(dt='2020-04-15');
​
​
--insert ... select can populate dynamic partitions successfully
insert into table t_partition_test2 partition(dt)
select id,name,dt from t_partition_test;
insert into table t_partition_test3 partition(city='beijing',dt)
select id,name,dt from t_partition_test2;
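
A hedged sketch of the fully dynamic case: with nonstrict mode both partition columns may be dynamic, as long as they appear last in the select in partition-spec order (the literal 'beijing' is only there because t_partition_test2 has no city column):
insert into table t_partition_test3 partition(city,dt)
select id,name,'beijing' as city,dt from t_partition_test2;
-- The reverse mix, partition(city,dt='2020-04-16') with a dynamic column before a static one,
-- is rejected for the reason explained in the comment at the top of this section.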

15. Bucketing:

/*
For every table or partition, Hive can further organize the data into buckets, a finer-grained division
of the data. Bucketing is done on a column: Hive hashes the column value and takes the remainder modulo
the number of buckets to decide which bucket a record goes into.
There are two reasons to organize a table (or partition) into buckets:
(1) More efficient query processing. Buckets add extra structure that Hive can exploit for some queries.
In particular, a join of two tables that are bucketed on the same (join) column can be executed as a
map-side join: only the buckets holding the same column values need to be joined with each other, which
can greatly reduce the amount of data involved in the JOIN.
(2) More efficient sampling. When working with large datasets, being able to try out a query on a small
slice of the data is very convenient while developing and revising queries.
*/
​
​
set hive.enforce.bucketing = true; 
set mapreduce.job.reduces=4;
​
​
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno) 
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';
​
​
insert overwrite table stu_buck
select * from student cluster by(Sno) sort by(Sage);  -- errors: cluster by and sort by cannot be used in the same statement
​
​
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);
​
​
insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);
​
​
insert overwrite table stu_buck
select * from student cluster by(Sno);
​
​
drop table sc_buck;
create table sc_buck(Sno int,Cno int,Grade int)
clustered by(Sno) 
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';
​
​
insert overwrite table sc_buck
select * from sc cluster by(Sno);
​
​
select a.*,b.* from stu_buck a join sc_buck b on a.sno=b.sno;
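
Because stu_buck and sc_buck are both bucketed (and sorted) on Sno into the same number of buckets, the join above is a candidate for a map-side bucket join; a minimal sketch of enabling it (sort-merge variants need further settings):
set hive.optimize.bucketmapjoin=true;
select a.*,b.* from stu_buck a join sc_buck b on a.sno=b.sno;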
​
​
--Sampling example: tablesample(bucket X out of Y); Y should preferably be an integer multiple of the number of buckets
select * from stu_buck tablesample(bucket 1 out of 4 on Sno);

16. User-defined functions

/*
Given the following JSON data (rating.json):
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
It needs to be loaded into Hive for analysis.
*/
​
​
-- Create a table mapping the raw data above
create table t_ratingjson(json string);
​
​
load data local inpath '/root/hivetest/rating.json' into table t_ratingjson;
​
​
/*
We want to turn the raw data above into the following form:
1193,5,978300760,1
661,3,978302109,1
914,3,978301968,1
3408,4,978300275,1

Idea: this becomes straightforward once a JSON-parsing function is available.
*/
create table t_rate
as
select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson;
​
​
/*Solution:
How to define your own function in Hive:
1. Write a Java class (extends UDF, overloading public C evaluate(A a, B b)) that implements the desired
   logic (here: take a JSON string and an index, and return the corresponding field value).
2. Package the Java program into a jar and upload it to the machine where Hive runs.
3. In the Hive CLI, add the jar to the classpath:
    hive>add jar /root/hivetest/myjson.jar;
4. In the Hive CLI, create a function named myjson bound to that Java class:
    hive> create temporary function myjson as 'cn.edu360.hive.udf.MyJsonParser';
*/

17. transform example:

--1. First load the rating.json file into a raw Hive table rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
​
​
--2. Parse the JSON into the four fields needed later and insert them into a new table t_rating
insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid,get_json_object(line,'$.rate') as rate,
get_json_object(line,'$.timeStamp') as timestring,get_json_object(line,'$.uid') as uid from rat_json;
​
​
/*3. Use transform + Python to convert the unix timestamp into a weekday.
First write a Python script:
######## python code ########
vi weekday_mapper.py
#!/bin/python
import sys
import datetime

for line in sys.stdin:
  line = line.strip()
  movieid, rating, unixtime,userid = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([movieid, rating, str(weekday),userid])

Save the file.
Then add the file to Hive's classpath:
*/
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>
create TABLE u_data_new as
SELECT
  TRANSFORM (movieid, rate, timestring,uid)
  USING 'python weekday_mapper.py'
  AS (movieid, rate, weekday,uid)
FROM t_rating;
​
​
select distinct(weekday) from u_data_new limit 10;

18. Daily active and new users
 

--Compute daily active users and daily new users from web access logs; sample data:
/*
day10:
192.168.33.6,hunter,2020-04-10 10:30:20,/a
192.168.33.7,hunter,2020-04-10 10:30:26,/b
192.168.33.6,jack,2020-04-10 10:30:27,/a
192.168.33.8,tom,2020-04-10 10:30:28,/b
192.168.33.9,rose,2020-04-10 10:30:30,/b
192.168.33.10,julia,2020-04-10 10:30:40,/c
​
​
day11 data:
192.168.33.16,hunter,2020-04-11 10:30:20,/a
192.168.33.18,jerry,2020-04-11 10:30:30,/b
192.168.33.26,jack,2020-04-11 10:30:40,/a
192.168.33.18,polo,2020-04-11 10:30:50,/b
192.168.33.39,nissan,2020-04-11 10:30:53,/b
192.168.33.39,nissan,2020-04-11 10:30:55,/a
192.168.33.39,nissan,2020-04-11 10:30:58,/c
192.168.33.20,ford,2020-04-11 10:30:54,/c
​
​
day12 data:
192.168.33.46,hunter,2020-04-12 10:30:21,/a
192.168.43.18,jerry,2020-04-12 10:30:22,/b
192.168.43.26,tom,2020-04-12 10:30:23,/a
192.168.53.18,bmw,2020-04-12 10:30:24,/b
192.168.63.39,benz,2020-04-12 10:30:25,/b
192.168.33.25,haval,2020-04-12 10:30:30,/c
192.168.33.10,julia,2020-04-12 10:30:40,/c
*/
​
​
create table t_user(ip string,uname string,login_time timeStamp,url string)
partitioned by(dt date)
row format delimited fields terminated by ',';
​
​
load data local inpath '/root/t_data/day10' into table t_user partition(dt='2020-04-10');
load data local inpath '/root/t_data/day11' into table t_user partition(dt='2020-04-11');
load data local inpath '/root/t_data/day12' into table t_user partition(dt='2020-04-12');
​
​
--Daily active users (keep each user's first access of the day)
create table t_user_active like t_user;
​
​
insert into table t_user_active partition(dt='2020-04-10') 
select ip,uname,login_time,url
from 
(select ip,uname,login_time,url
,row_number() over(partition by uname order by login_time) rn
from t_user  where dt='2020-04-10') tmp where tmp.rn = 1;
​
​
insert into table t_user_active partition(dt='2020-04-11') 
select ip,uname,login_time,url
from 
(select ip,uname,login_time,url
,row_number() over(partition by uname order by login_time) rn
from t_user  where dt='2020-04-11') tmp where tmp.rn = 1;
​
​
insert into table t_user_active partition(dt='2020-04-12') 
select ip,uname,login_time,url
from 
(select ip,uname,login_time,url
,row_number() over(partition by uname order by login_time) rn
from t_user  where dt='2020-04-12') tmp where tmp.rn = 1;
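
A hedged sketch: with dynamic partitioning (section 14) the three per-day inserts above collapse into a single statement:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table t_user_active partition(dt)
select ip,uname,login_time,url,dt
from
(select ip,uname,login_time,url,dt
,row_number() over(partition by dt,uname order by login_time) rn
from t_user) tmp where tmp.rn = 1;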
​
​
--Daily new users (active on this day but never active on any earlier day)
create table t_user_new like t_user;
​
​
insert into table t_user_new partition(dt='2020-04-10') 
select ip,uname,login_time,url from (
select a.*,b.uname b_uname from 
(select * 
from t_user_active where dt='2020-04-10') a 
left join (select uname
from t_user_active where dt<'2020-04-10') b  on a.uname = b.uname) tmp where tmp.b_uname is null;
​
​
insert into table t_user_new partition(dt='2020-04-11') 
select ip,uname,login_time,url from (
select a.*,b.uname b_uname from 
(select * 
from t_user_active where dt='2020-04-11') a 
left join (select uname
from t_user_active where dt<'2020-04-11') b  on a.uname = b.uname) tmp where tmp.b_uname is null;
​
​
insert into table t_user_new partition(dt='2020-04-12') 
select ip,uname,login_time,url from (
select a.*,b.uname b_uname from 
(select * 
from t_user_active where dt='2020-04-12') a 
left join (select uname
from t_user_active where dt<'2020-04-12') b  on a.uname = b.uname) tmp where tmp.b_uname is null;

19. Dimension statistics

--Create the dimension-statistics result table dim_user_active_day
DROP TABLE dim_user_active_day;
CREATE TABLE dim_user_active_day (
    os_name string
    ,city string
    ,release_channel string
    ,app_ver_name string
    ,cnts INT
    ) partitioned BY (
    day string
    ,dim string
    );
​
​
-- Use multi-insert syntax to count daily active users for each combination of dimensions and write each result into the corresponding partition of the dimension table;
FROM etl_user_active_day
​
​
INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2017-09-21'
    ,dim = '0000'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2017-09-21'
​
​
INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2017-09-21'
    ,dim = '1000'
    )
SELECT os_name
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2017-09-21'
GROUP BY (os_name)
​
​
INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2017-09-21'
    ,dim = '0100'
    )
SELECT 'all'
    ,city
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2017-09-21'
GROUP BY (city)
​
​
INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2017-09-21'
    ,dim = '0010'
    )
SELECT 'all'
    ,'all'
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2017-09-21'
GROUP BY (release_channel)
​
​
INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2017-09-21'
    ,dim = '0001'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2017-09-21'
GROUP BY (app_ver_name)
​
​
......
;

 
