数据准备
用户购买机票的时间和价格
create table if not exists tmp.tmp_test_function_1(
user_id string,
create_time string,
flight_size int,
ticket_sal double
)
row format delimited
fields terminated by '\t';
user_id | create_time | flight_size | ticket_sal
---------+-------------+-------------+------------
user1 | 2015-03-10 | 2 | 1000.00
user1 | 2015-03-12 | 5 | 1500.00
user1 | 2015-03-15 | 1 | 800.00
user1 | 2015-04-11 | 1 | 400.00
user1 | 2015-04-29 | 4 | 1200.00
user1 | 2015-05-10 | 7 | 600.00
user1 | 2015-07-08 | 2 | 300.00
user2 | 2015-02-21 | 1 | 1000.00
user2 | 2015-03-10 | 1 | 200.00
user2 | 2015-04-14 | 6 | 450.00
user2 | 2015-04-16 | 4 | 1200.00
user2 | 2015-08-10 | 3 | 1500.00
user2 | 2015-08-15 | 2 | 400.00
user2 | 2015-11-02 | 1 | 500.00
user2 | 2016-03-10 | 1 | 700.00
window子句
- ROWS:按照物理行数进行操作
- RANGE:按照逻辑行数进行操作ROWS
- PRECEDING:往前
- FOLLOWING:往后
- CURRENT ROW:当前行
- UNBOUNDED:起点
- UNBOUNDED PRECEDING:表示从前面的起点
- UNBOUNDED FOLLOWING:表示到后面的终点
注:window子句不适用于序列函数(如first_value)
sum,avg,max,min
OVER():指定分析函数工作的数据窗口大小,决定了聚合函数的范围,这个数据窗口大小可能会随着行的变而变化,同时可以使用以下进行限定范围。
select
user_id,
create_time,
flight_size,
ticket_sal,
sum(flight_size) over(partition by user_id order by create_time) as fs1,
sum(flight_size) over(partition by user_id order by create_time rows between unbounded preceding and current row) as fs2,
sum(flight_size) over(partition by user_id order by create_time rows between 3 preceding and 1 following) as fs3,
sum(flight_size) over(partition by user_id order by ticket_sal range between unbounded preceding and current row) as fs4
from
tmp.tmp_test_function_1
order by
user_id,create_time
user_id | create_time | flight_size | ticket_sal | fs1 | fs2 | fs3 | fs4
---------+-------------+-------------+------------+-----+-----+-----+-----
user1 | 2015-03-10 | 2 | 1000.00 | 2 | 2 | 7 | 13
user1 | 2015-03-12 | 5 | 1500.00 | 7 | 7 | 8 | 22
user1 | 2015-03-15 | 1 | 800.00 | 8 | 8 | 9 | 11
user1 | 2015-04-11 | 1 | 400.00 | 9 | 9 | 13 | 3
user1 | 2015-04-29 | 4 | 1200.00 | 13 | 13 | 18 | 17
user1 | 2015-05-10 | 7 | 600.00 | 20 | 20 | 15 | 10
user1 | 2015-07-08 | 2 | 300.00 | 22 | 22 | 14 | 2
user2 | 2015-02-21 | 1 | 1000.00 | 1 | 1 | 2 | 12
user2 | 2015-03-10 | 1 | 200.00 | 2 | 2 | 8 | 1
user2 | 2015-04-14 | 6 | 450.00 | 8 | 8 | 12 | 9
user2 | 2015-04-16 | 4 | 1200.00 | 12 | 12 | 15 | 16
user2 | 2015-08-10 | 3 | 1500.00 | 15 | 15 | 16 | 19
user2 | 2015-08-15 | 2 | 400.00 | 17 | 17 | 16 | 3
user2 | 2015-11-02 | 1 | 500.00 | 18 | 18 | 11 | 10
user2 | 2016-03-10 | 1 | 700.00 | 19 | 19 | 7 | 11
注:avg,max,min类似,都是统计到当前行
sum(cost) over() as sample1 所有行相加
ntile(分片函数)
用途实例:某用户购票量最多的前1/3天
select
user_id,
create_time,
flight_size,
ntile(2) over(partition by user_id order by create_time) as rn1,
ntile(3) over(partition by user_id order by create_time) as rn2,
ntile(4) over(partition by user_id order by create_time) as rn3
from
tmp.tmp_test_function_1
user_id | create_time | flight_size | rn1 | rn2 | rn3
---------+-------------+-------------+-----+-----+-----
user1 | 2015-03-10 | 2 | 1 | 1 | 1
user1 | 2015-03-12 | 5 | 1 | 1 | 1
user1 | 2015-03-15 | 1 | 1 | 1 | 2
user1 | 2015-04-11 | 1 | 1 | 2 | 2
user1 | 2015-04-29 | 4 | 2 | 2 | 3
user1 | 2015-05-10 | 7 | 2 | 3 | 3
user1 | 2015-07-08 | 2 | 2 | 3 | 4
user2 | 2015-02-21 | 1 | 1 | 1 | 1
user2 | 2015-03-10 | 1 | 1 | 1 | 1
user2 | 2015-04-14 | 6 | 1 | 1 | 2
user2 | 2015-04-16 | 4 | 1 | 2 | 2
user2 | 2015-08-10 | 3 | 2 | 2 | 3
user2 | 2015-08-15 | 2 | 2 | 2 | 3
user2 | 2015-11-02 | 1 | 2 | 3 | 4
user2 | 2016-03-10 | 1 | 2 | 3 | 4
注:多余的数据从前往后添加,默认从第一行开始添加
row_number,rank,dense_rank(顺序计算)
用途实例:取某用户购票量最多的前两天
select
user_id,
create_time,
flight_size,
rank() over(partition by user_id order by flight_size desc) as rn1,
dense_rank() over(partition by user_id order by flight_size desc) as rn2,
row_number() over(partition by user_id order by flight_size desc) as rn3
from
tmp.tmp_test_function_1;
user_id | create_time | flight_size | rn1 | rn2 | rn3
---------+-------------+-------------+-----+-----+-----
user1 | 2015-05-10 | 7 | 1 | 1 | 1
user1 | 2015-03-12 | 5 | 2 | 2 | 2
user1 | 2015-04-29 | 4 | 3 | 3 | 3
user1 | 2015-07-08 | 2 | 4 | 4 | 4
user1 | 2015-03-10 | 2 | 4 | 4 | 5
user1 | 2015-03-15 | 1 | 6 | 5 | 6
user1 | 2015-04-11 | 1 | 6 | 5 | 7
user2 | 2015-04-14 | 6 | 1 | 1 | 1
user2 | 2015-04-16 | 4 | 2 | 2 | 2
user2 | 2015-08-10 | 3 | 3 | 3 | 3
user2 | 2015-08-15 | 2 | 4 | 4 | 4
user2 | 2015-02-21 | 1 | 5 | 5 | 5
user2 | 2015-03-10 | 1 | 5 | 5 | 6
user2 | 2015-11-02 | 1 | 5 | 5 | 7
user2 | 2016-03-10 | 1 | 5 | 5 | 8
三者区别:rank值和dense_rank值会计算重复行,而rank总数不会减少;row_number就是系统计算所有行
cume_dist(小于等于当前值的行数/分组内总行数)
用途示例:查询小于等于某票价在指定购票记录中的占比
select
user_id,
create_time,
ticket_sal,
round(cume_dist() over(order by ticket_sal),2) as cd1,
round(cume_dist() over(partition by user_id order by ticket_sal),2) as cd2
from
tmp.tmp_test_function_1
user_id | create_time | ticket_sal | cd1 | cd2
---------+-------------+------------+------+------
user1 | 2015-07-08 | 300.00 | 0.13 | 0.14
user1 | 2015-04-11 | 400.00 | 0.27 | 0.29
user1 | 2015-05-10 | 600.00 | 0.47 | 0.43
user1 | 2015-03-15 | 800.00 | 0.6 | 0.57
user1 | 2015-03-10 | 1000.00 | 0.73 | 0.71
user1 | 2015-04-29 | 1200.00 | 0.87 | 0.86
user1 | 2015-03-12 | 1500.00 | 1.0 | 1.0
user2 | 2015-03-10 | 200.00 | 0.07 | 0.13
user2 | 2015-08-15 | 400.00 | 0.27 | 0.25
user2 | 2015-04-14 | 450.00 | 0.33 | 0.38
user2 | 2015-11-02 | 500.00 | 0.4 | 0.5
user2 | 2016-03-10 | 700.00 | 0.53 | 0.63
user2 | 2015-02-21 | 1000.00 | 0.73 | 0.75
user2 | 2015-04-16 | 1200.00 | 0.87 | 0.88
user2 | 2015-08-10 | 1500.00 | 1.0 | 1.0
percent_rank(分组内当前行的RANK值-1/分组内总行数-1)
select
user_id,
create_time,
ticket_sal,
rank() over(order by ticket_sal) as rank1,
sum(1) over(partition by null) as sum1,
round(percent_rank() over(order by ticket_sal),2) as pr1,
rank() over(partition by user_id order by ticket_sal) as rank2,
sum(1) over(partition by user_id) as sum2,
round(percent_rank() over(partition by user_id order by ticket_sal),2) as pr2
from
tmp.tmp_test_function_1
user_id | create_time | ticket_sal | rank1 | sum1 | pr1 | rank2 | sum2 | pr2
---------+-------------+------------+-------+------+------+-------+------+------
user1 | 2015-07-08 | 300.00 | 2 | 15 | 0.07 | 1 | 7 | 0.0
user1 | 2015-04-11 | 400.00 | 3 | 15 | 0.14 | 2 | 7 | 0.17
user1 | 2015-05-10 | 600.00 | 7 | 15 | 0.43 | 3 | 7 | 0.33
user1 | 2015-03-15 | 800.00 | 9 | 15 | 0.57 | 4 | 7 | 0.5
user1 | 2015-03-10 | 1000.00 | 10 | 15 | 0.64 | 5 | 7 | 0.67
user1 | 2015-04-29 | 1200.00 | 12 | 15 | 0.79 | 6 | 7 | 0.83
user1 | 2015-03-12 | 1500.00 | 14 | 15 | 0.93 | 7 | 7 | 1.0
user2 | 2015-03-10 | 200.00 | 1 | 15 | 0.0 | 1 | 8 | 0.0
user2 | 2015-08-15 | 400.00 | 3 | 15 | 0.14 | 2 | 8 | 0.14
user2 | 2015-04-14 | 450.00 | 5 | 15 | 0.29 | 3 | 8 | 0.29
user2 | 2015-11-02 | 500.00 | 6 | 15 | 0.36 | 4 | 8 | 0.43
user2 | 2016-03-10 | 700.00 | 8 | 15 | 0.5 | 5 | 8 | 0.57
user2 | 2015-02-21 | 1000.00 | 10 | 15 | 0.64 | 6 | 8 | 0.71
user2 | 2015-04-16 | 1200.00 | 12 | 15 | 0.79 | 7 | 8 | 0.86
user2 | 2015-08-10 | 1500.00 | 14 | 15 | 0.93 | 8 | 8 | 1.0
lag, lead, first_value, last_value
LAG(用于统计窗口内往上第n行值),LEAD(用于统计窗口内往下第n行值),FIRST_VALUE(取分组内排序后,截止到当前行,第一个值),LAST_VALUE(取分组内排序后,截止到当前行,最后一个值)
用途示例:一年中销量最高和最低的月份;查询某用户上一次购票日期、下一次购票日期、第一次购票日期或者最后一次购票日期
select
user_id,
create_time,
flight_size,
ticket_sal,
row_number() over(partition by user_id order by create_time) as rn,
lag(create_time,2) over(partition by user_id order by create_time) as lag,
lead(create_time,1) over(partition by user_id order by create_time) as lead,
first_value(create_time) over(partition by user_id order by create_time) as fv,
last_value(create_time) over(partition by user_id order by create_time) as lv
from
tmp.tmp_test_function_1
user_id | create_time | flight_size | ticket_sal | rn | lag | lead | fv | lv
---------+-------------+-------------+------------+----+------------+------------+------------+------------
user1 | 2015-03-10 | 2 | 1000.00 | 1 | NULL | 2015-03-12 | 2015-03-10 | 2015-03-10
user1 | 2015-03-12 | 5 | 1500.00 | 2 | NULL | 2015-03-15 | 2015-03-10 | 2015-03-12
user1 | 2015-03-15 | 1 | 800.00 | 3 | 2015-03-10 | 2015-04-11 | 2015-03-10 | 2015-03-15
user1 | 2015-04-11 | 1 | 400.00 | 4 | 2015-03-12 | 2015-04-29 | 2015-03-10 | 2015-04-11
user1 | 2015-04-29 | 4 | 1200.00 | 5 | 2015-03-15 | 2015-05-10 | 2015-03-10 | 2015-04-29
user1 | 2015-05-10 | 7 | 600.00 | 6 | 2015-04-11 | 2015-07-08 | 2015-03-10 | 2015-05-10
user1 | 2015-07-08 | 2 | 300.00 | 7 | 2015-04-29 | NULL | 2015-03-10 | 2015-07-08
user2 | 2015-02-21 | 1 | 1000.00 | 1 | NULL | 2015-03-10 | 2015-02-21 | 2015-02-21
user2 | 2015-03-10 | 1 | 200.00 | 2 | NULL | 2015-04-14 | 2015-02-21 | 2015-03-10
user2 | 2015-04-14 | 6 | 450.00 | 3 | 2015-02-21 | 2015-04-16 | 2015-02-21 | 2015-04-14
user2 | 2015-04-16 | 4 | 1200.00 | 4 | 2015-03-10 | 2015-08-10 | 2015-02-21 | 2015-04-16
user2 | 2015-08-10 | 3 | 1500.00 | 5 | 2015-04-14 | 2015-08-15 | 2015-02-21 | 2015-08-10
user2 | 2015-08-15 | 2 | 400.00 | 6 | 2015-04-16 | 2015-11-02 | 2015-02-21 | 2015-08-15
user2 | 2015-11-02 | 1 | 500.00 | 7 | 2015-08-10 | 2016-03-10 | 2015-02-21 | 2015-11-02
user2 | 2016-03-10 | 1 | 700.00 | 8 | 2015-08-15 | NULL | 2015-02-21 | 2016-03-10
GROUPING SETS,GROUPING__ID(标识分组集合,用数字简单标识)
注:使用union操作会增加IO开销,会减少cpu和内存的开销,使用grouping sets会减少IO开销,会增加cpu和内存的消耗.
union all查询的时候需要添加为子查询,hive不支持顶层union all
select
substr(create_time,1,7) as month,
create_time as day,
sum(flight_size) as flight_size,
grouping__id
from
tmp.tmp_test_function_1
group by substr(create_time,1,7), create_time
grouping sets(substr(create_time,1,7), create_time)
order by
grouping__id -- 这是两道下划线!!!
month day flight_size grouping__id
-------------------------------------------------
2015-11 NULL 1 1
2015-08 NULL 5 1
2015-07 NULL 2 1
2016-03 NULL 1 1
2015-05 NULL 7 1
2015-04 NULL 15 1
2015-03 NULL 9 1
2015-02 NULL 1 1
NULL 2015-08-10 3 2
NULL 2016-03-10 1 2
NULL 2015-11-02 1 2
NULL 2015-08-15 2 2
NULL 2015-07-08 2 2
NULL 2015-05-10 7 2
NULL 2015-04-29 4 2
NULL 2015-04-16 4 2
NULL 2015-04-14 6 2
NULL 2015-04-11 1 2
NULL 2015-03-15 1 2
NULL 2015-03-12 5 2
NULL 2015-03-10 3 2
NULL 2015-02-21 1 2
相当于
SELECT substr(create_time,1,7) as month,NULL as day,sum(flight_size) AS flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL
SELECT NULL as month,create_time as day,sum(flight_size) AS flight_size,2 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY create_time
cube(根据GROUP BY的维度的所有组合进行聚合)
select
substr(create_time,1,7) as month,
create_time as day,
sum(flight_size) as flight_size,
grouping__id
from
tmp.tmp_test_function_1
group by substr(create_time,1,7),create_time
with cube
grouping sets(substr(create_time,1,7), create_time, (substr(create_time,1,7),create_time))
order by
grouping__id;
month day flight_size grouping__id
-------------------------------------------------
NULL NULL 41 0
2015-03 NULL 9 1
2016-03 NULL 1 1
2015-02 NULL 1 1
2015-07 NULL 2 1
2015-11 NULL 1 1
2015-04 NULL 15 1
2015-05 NULL 7 1
2015-08 NULL 5 1
NULL 2015-11-02 1 2
NULL 2015-08-15 2 2
NULL 2015-08-10 3 2
NULL 2015-07-08 2 2
NULL 2015-05-10 7 2
NULL 2015-04-29 4 2
NULL 2015-04-16 4 2
NULL 2015-04-14 6 2
NULL 2015-04-11 1 2
NULL 2015-03-15 1 2
NULL 2015-03-12 5 2
NULL 2015-03-10 3 2
NULL 2015-02-21 1 2
NULL 2016-03-10 1 2
2016-03 2016-03-10 1 3
2015-11 2015-11-02 1 3
2015-08 2015-08-15 2 3
2015-08 2015-08-10 3 3
2015-07 2015-07-08 2 3
2015-05 2015-05-10 7 3
2015-04 2015-04-29 4 3
2015-04 2015-04-16 4 3
2015-04 2015-04-14 6 3
2015-04 2015-04-11 1 3
2015-03 2015-03-15 1 3
2015-03 2015-03-12 5 3
2015-02 2015-02-21 1 3
2015-03 2015-03-10 3 3
相当于
SELECT NULL,NULL,sum(flight_size) as flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL
SELECT substr(create_time,1,7),NULL,sum(flight_size) as flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL
SELECT NULL,create_time,sum(flight_size) as flight_size,2 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY create_time
UNION ALL
SELECT substr(create_time,1,7),create_time,sum(flight_size) as flight_size,3 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7),create_time
注:这个是分组字段的所有组合方式,包括null null
rollup(根据左侧的维度来进行层级聚合)
select
substr(create_time,1,7) as month,
create_time as day,
sum(flight_size) as flight_size,
grouping__id
from
tmp.tmp_test_function_1
group by substr(create_time,1,7),create_time
with rollup
grouping sets(substr(create_time,1,7), create_time, (substr(create_time,1,7),create_time))
order by
grouping__id
month day flight_size grouping__id
-------------------------------------------------
NULL NULL 41 0
2015-03 NULL 9 1
2015-08 NULL 5 1
2015-02 NULL 1 1
2016-03 NULL 1 1
2015-07 NULL 2 1
2015-04 NULL 15 1
2015-11 NULL 1 1
2015-05 NULL 7 1
2016-03 2016-03-10 1 3
2015-11 2015-11-02 1 3
2015-08 2015-08-15 2 3
2015-08 2015-08-10 3 3
2015-07 2015-07-08 2 3
2015-05 2015-05-10 7 3
2015-04 2015-04-16 4 3
2015-04 2015-04-14 6 3
2015-04 2015-04-11 1 3
2015-03 2015-03-15 1 3
2015-03 2015-03-12 5 3
2015-03 2015-03-10 3 3
2015-02 2015-02-21 1 3
2015-04 2015-04-29 4 3
相当于
SELECT NULL,NULL,sum(flight_size) as flight_size,0 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL
SELECT substr(create_time,1,7),NULL,sum(flight_size) as flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL
SELECT substr(create_time,1,7),create_time,sum(flight_size) as flight_size,3 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7),create_time
注:高维在左则进行下钻过程,低维在左则进行上钻
列转行
concat_ws+collect_set(去重)/collect_list(列转行)
用途示例:在一条记录中显示某用户某天购买的所有机票订单号
select
user_id,
concat_ws(',', collect_set(cast(create_time as string))) as times
from
tmp.tmp_test_function_1
group by
user_id
userid times
------------------------------------------------------------------------------------------------
user1 2015-03-10,2015-03-12,2015-03-15,2015-04-11,2015-04-29,2015-05-10,2015-07-08
user2 2015-02-21,2015-03-10,2015-04-14,2015-04-16,2015-08-10,2015-08-15,2015-11-02,2016-03-10
行转列
lateral view explode(行转列)
用途示例:某一条用户记录中对应着多个常用目的地,希望统计目的地的个数,可以在explod中使用split指定分隔符
select
user_id,
time
from
(
select
user_id,
concat_ws(',', collect_set(cast(create_time as string))) as times
from
tmp.tmp_test_function_1
group by
user_id
)A
lateral view explode(split(times, ',')) adTable as time
user_id time
-------------------
user1 2015-03-10
user1 2015-03-12
user1 2015-03-15
user1 2015-04-11
user1 2015-04-29
user1 2015-05-10
user1 2015-07-08
user2 2015-02-21
user2 2015-03-10
user2 2015-04-14
user2 2015-04-16
user2 2015-08-10
user2 2015-08-15
user2 2015-11-02
user2 2016-03-10