Hive -- 窗口分析函数

数据准备

用户购买机票的时间和价格

create table if not exists tmp.tmp_test_function_1(
user_id string,
create_time string,
flight_size int,
ticket_sal double
)
row format delimited
fields terminated by '\t';
 user_id | create_time | flight_size | ticket_sal
---------+-------------+-------------+------------
 user1   | 2015-03-10  |           2 | 1000.00   
 user1   | 2015-03-12  |           5 | 1500.00   
 user1   | 2015-03-15  |           1 | 800.00    
 user1   | 2015-04-11  |           1 | 400.00    
 user1   | 2015-04-29  |           4 | 1200.00   
 user1   | 2015-05-10  |           7 | 600.00    
 user1   | 2015-07-08  |           2 | 300.00    
 user2   | 2015-02-21  |           1 | 1000.00   
 user2   | 2015-03-10  |           1 | 200.00    
 user2   | 2015-04-14  |           6 | 450.00    
 user2   | 2015-04-16  |           4 | 1200.00   
 user2   | 2015-08-10  |           3 | 1500.00   
 user2   | 2015-08-15  |           2 | 400.00    
 user2   | 2015-11-02  |           1 | 500.00    
 user2   | 2016-03-10  |           1 | 700.00

window子句

在这里插入图片描述

  • ROWS:按照物理行数进行操作
  • RANGE:按照逻辑行数进行操作ROWS
  • PRECEDING:往前
  • FOLLOWING:往后
  • CURRENT ROW:当前行
  • UNBOUNDED:起点
  • UNBOUNDED PRECEDING:表示从前面的起点
  • UNBOUNDED FOLLOWING:表示到后面的终点
    注:window子句不适用于序列函数(如first_value)

sum,avg,max,min

OVER():指定分析函数工作的数据窗口大小,决定了聚合函数的范围,这个数据窗口大小可能会随着行的变而变化,同时可以使用以下进行限定范围。

select
  user_id,
  create_time,
  flight_size,
  ticket_sal,
  sum(flight_size) over(partition by user_id order by create_time) as fs1,
  sum(flight_size) over(partition by user_id order by create_time rows between unbounded preceding and current row) as fs2,
  sum(flight_size) over(partition by user_id order by create_time rows between 3 preceding and 1 following) as fs3,
  sum(flight_size) over(partition by user_id order by ticket_sal range between unbounded preceding and current row) as fs4
from
  tmp.tmp_test_function_1
order by
  user_id,create_time
 user_id | create_time | flight_size | ticket_sal | fs1 | fs2 | fs3 | fs4
---------+-------------+-------------+------------+-----+-----+-----+-----
 user1   | 2015-03-10  |           2 | 1000.00    |   2 |   2 |   7 |  13
 user1   | 2015-03-12  |           5 | 1500.00    |   7 |   7 |   8 |  22
 user1   | 2015-03-15  |           1 | 800.00     |   8 |   8 |   9 |  11
 user1   | 2015-04-11  |           1 | 400.00     |   9 |   9 |  13 |   3
 user1   | 2015-04-29  |           4 | 1200.00    |  13 |  13 |  18 |  17
 user1   | 2015-05-10  |           7 | 600.00     |  20 |  20 |  15 |  10
 user1   | 2015-07-08  |           2 | 300.00     |  22 |  22 |  14 |   2
 user2   | 2015-02-21  |           1 | 1000.00    |   1 |   1 |   2 |  12
 user2   | 2015-03-10  |           1 | 200.00     |   2 |   2 |   8 |   1
 user2   | 2015-04-14  |           6 | 450.00     |   8 |   8 |  12 |   9
 user2   | 2015-04-16  |           4 | 1200.00    |  12 |  12 |  15 |  16
 user2   | 2015-08-10  |           3 | 1500.00    |  15 |  15 |  16 |  19
 user2   | 2015-08-15  |           2 | 400.00     |  17 |  17 |  16 |   3
 user2   | 2015-11-02  |           1 | 500.00     |  18 |  18 |  11 |  10
 user2   | 2016-03-10  |           1 | 700.00     |  19 |  19 |   7 |  11

注:avg,max,min类似,都是统计到当前行
sum(cost) over() as sample1 所有行相加

ntile(分片函数)

用途实例:某用户购票量最多的前1/3天

select
  user_id,
  create_time,
  flight_size,
  ntile(2) over(partition by user_id order by create_time) as rn1,
  ntile(3) over(partition by user_id order by create_time) as rn2,
  ntile(4) over(partition by user_id order by create_time) as rn3
from
  tmp.tmp_test_function_1
 user_id | create_time | flight_size | rn1 | rn2 | rn3
---------+-------------+-------------+-----+-----+-----
 user1   | 2015-03-10  |           2 |   1 |   1 |   1
 user1   | 2015-03-12  |           5 |   1 |   1 |   1
 user1   | 2015-03-15  |           1 |   1 |   1 |   2
 user1   | 2015-04-11  |           1 |   1 |   2 |   2
 user1   | 2015-04-29  |           4 |   2 |   2 |   3
 user1   | 2015-05-10  |           7 |   2 |   3 |   3
 user1   | 2015-07-08  |           2 |   2 |   3 |   4
 user2   | 2015-02-21  |           1 |   1 |   1 |   1
 user2   | 2015-03-10  |           1 |   1 |   1 |   1
 user2   | 2015-04-14  |           6 |   1 |   1 |   2
 user2   | 2015-04-16  |           4 |   1 |   2 |   2
 user2   | 2015-08-10  |           3 |   2 |   2 |   3
 user2   | 2015-08-15  |           2 |   2 |   2 |   3
 user2   | 2015-11-02  |           1 |   2 |   3 |   4
 user2   | 2016-03-10  |           1 |   2 |   3 |   4

注:多余的数据从前往后添加,默认从第一行开始添加

row_number,rank,dense_rank(顺序计算)

用途实例:取某用户购票量最多的前两天

select
  user_id,
  create_time,
  flight_size,
  rank() over(partition by user_id order by flight_size desc) as rn1, 
  dense_rank() over(partition by user_id order by flight_size desc) as rn2, 
  row_number() over(partition by user_id order by flight_size desc) as rn3
from
  tmp.tmp_test_function_1;
 user_id | create_time | flight_size | rn1 | rn2 | rn3
---------+-------------+-------------+-----+-----+-----
 user1   | 2015-05-10  |           7 |   1 |   1 |   1
 user1   | 2015-03-12  |           5 |   2 |   2 |   2
 user1   | 2015-04-29  |           4 |   3 |   3 |   3
 user1   | 2015-07-08  |           2 |   4 |   4 |   4
 user1   | 2015-03-10  |           2 |   4 |   4 |   5
 user1   | 2015-03-15  |           1 |   6 |   5 |   6
 user1   | 2015-04-11  |           1 |   6 |   5 |   7
 user2   | 2015-04-14  |           6 |   1 |   1 |   1
 user2   | 2015-04-16  |           4 |   2 |   2 |   2
 user2   | 2015-08-10  |           3 |   3 |   3 |   3
 user2   | 2015-08-15  |           2 |   4 |   4 |   4
 user2   | 2015-02-21  |           1 |   5 |   5 |   5
 user2   | 2015-03-10  |           1 |   5 |   5 |   6
 user2   | 2015-11-02  |           1 |   5 |   5 |   7
 user2   | 2016-03-10  |           1 |   5 |   5 |   8

三者区别:rank值和dense_rank值会计算重复行,而rank总数不会减少;row_number就是系统计算所有行

cume_dist(小于等于当前值的行数/分组内总行数)

用途示例:查询小于等于某票价在指定购票记录中的占比

select
  user_id,
  create_time,
  ticket_sal,
  round(cume_dist() over(order by ticket_sal),2) as cd1,
  round(cume_dist() over(partition by user_id order by ticket_sal),2) as cd2
from
  tmp.tmp_test_function_1

 user_id | create_time | ticket_sal | cd1  | cd2 
---------+-------------+------------+------+------
 user1   | 2015-07-08  | 300.00     | 0.13 | 0.14
 user1   | 2015-04-11  | 400.00     | 0.27 | 0.29
 user1   | 2015-05-10  | 600.00     | 0.47 | 0.43
 user1   | 2015-03-15  | 800.00     |  0.6 | 0.57
 user1   | 2015-03-10  | 1000.00    | 0.73 | 0.71
 user1   | 2015-04-29  | 1200.00    | 0.87 | 0.86
 user1   | 2015-03-12  | 1500.00    |  1.0 |  1.0
 user2   | 2015-03-10  | 200.00     | 0.07 | 0.13
 user2   | 2015-08-15  | 400.00     | 0.27 | 0.25
 user2   | 2015-04-14  | 450.00     | 0.33 | 0.38
 user2   | 2015-11-02  | 500.00     |  0.4 |  0.5
 user2   | 2016-03-10  | 700.00     | 0.53 | 0.63
 user2   | 2015-02-21  | 1000.00    | 0.73 | 0.75
 user2   | 2015-04-16  | 1200.00    | 0.87 | 0.88
 user2   | 2015-08-10  | 1500.00    |  1.0 |  1.0

percent_rank(分组内当前行的RANK值-1/分组内总行数-1)

select
  user_id,
  create_time,
  ticket_sal, 
  rank() over(order by ticket_sal) as rank1,
  sum(1) over(partition by null) as sum1,
  round(percent_rank() over(order by ticket_sal),2) as pr1,
  rank() over(partition by user_id order by ticket_sal) as rank2,
  sum(1) over(partition by user_id) as sum2,
  round(percent_rank() over(partition by user_id order by ticket_sal),2) as pr2
from
  tmp.tmp_test_function_1
 user_id | create_time | ticket_sal | rank1 | sum1 | pr1  | rank2 | sum2 | pr2 
---------+-------------+------------+-------+------+------+-------+------+------
 user1   | 2015-07-08  | 300.00     |     2 |   15 | 0.07 |     1 |    7 |  0.0
 user1   | 2015-04-11  | 400.00     |     3 |   15 | 0.14 |     2 |    7 | 0.17
 user1   | 2015-05-10  | 600.00     |     7 |   15 | 0.43 |     3 |    7 | 0.33
 user1   | 2015-03-15  | 800.00     |     9 |   15 | 0.57 |     4 |    7 |  0.5
 user1   | 2015-03-10  | 1000.00    |    10 |   15 | 0.64 |     5 |    7 | 0.67
 user1   | 2015-04-29  | 1200.00    |    12 |   15 | 0.79 |     6 |    7 | 0.83
 user1   | 2015-03-12  | 1500.00    |    14 |   15 | 0.93 |     7 |    7 |  1.0
 user2   | 2015-03-10  | 200.00     |     1 |   15 |  0.0 |     1 |    8 |  0.0
 user2   | 2015-08-15  | 400.00     |     3 |   15 | 0.14 |     2 |    8 | 0.14
 user2   | 2015-04-14  | 450.00     |     5 |   15 | 0.29 |     3 |    8 | 0.29
 user2   | 2015-11-02  | 500.00     |     6 |   15 | 0.36 |     4 |    8 | 0.43
 user2   | 2016-03-10  | 700.00     |     8 |   15 |  0.5 |     5 |    8 | 0.57
 user2   | 2015-02-21  | 1000.00    |    10 |   15 | 0.64 |     6 |    8 | 0.71
 user2   | 2015-04-16  | 1200.00    |    12 |   15 | 0.79 |     7 |    8 | 0.86
 user2   | 2015-08-10  | 1500.00    |    14 |   15 | 0.93 |     8 |    8 |  1.0

lag, lead, first_value, last_value

LAG(用于统计窗口内往上第n行值),LEAD(用于统计窗口内往下第n行值),FIRST_VALUE(取分组内排序后,截止到当前行,第一个值),LAST_VALUE(取分组内排序后,截止到当前行,最后一个值)
用途示例:一年中销量最高和最低的月份;查询某用户上一次购票日期、下一次购票日期、第一次购票日期或者最后一次购票日期

select
  user_id,
  create_time,
  flight_size,
  ticket_sal,
  row_number() over(partition by user_id order by create_time) as rn,
  lag(create_time,2) over(partition by user_id order by create_time) as lag,
  lead(create_time,1) over(partition by user_id order by create_time) as lead,
  first_value(create_time) over(partition by user_id order by create_time) as fv,
  last_value(create_time) over(partition by user_id order by create_time) as lv
from
  tmp.tmp_test_function_1

 user_id | create_time | flight_size | ticket_sal | rn |    lag     |    lead    |     fv     |     lv    
---------+-------------+-------------+------------+----+------------+------------+------------+------------
 user1   | 2015-03-10  |           2 | 1000.00    |  1 | NULL       | 2015-03-12 | 2015-03-10 | 2015-03-10
 user1   | 2015-03-12  |           5 | 1500.00    |  2 | NULL       | 2015-03-15 | 2015-03-10 | 2015-03-12
 user1   | 2015-03-15  |           1 | 800.00     |  3 | 2015-03-10 | 2015-04-11 | 2015-03-10 | 2015-03-15
 user1   | 2015-04-11  |           1 | 400.00     |  4 | 2015-03-12 | 2015-04-29 | 2015-03-10 | 2015-04-11
 user1   | 2015-04-29  |           4 | 1200.00    |  5 | 2015-03-15 | 2015-05-10 | 2015-03-10 | 2015-04-29
 user1   | 2015-05-10  |           7 | 600.00     |  6 | 2015-04-11 | 2015-07-08 | 2015-03-10 | 2015-05-10
 user1   | 2015-07-08  |           2 | 300.00     |  7 | 2015-04-29 | NULL       | 2015-03-10 | 2015-07-08
 user2   | 2015-02-21  |           1 | 1000.00    |  1 | NULL       | 2015-03-10 | 2015-02-21 | 2015-02-21
 user2   | 2015-03-10  |           1 | 200.00     |  2 | NULL       | 2015-04-14 | 2015-02-21 | 2015-03-10
 user2   | 2015-04-14  |           6 | 450.00     |  3 | 2015-02-21 | 2015-04-16 | 2015-02-21 | 2015-04-14
 user2   | 2015-04-16  |           4 | 1200.00    |  4 | 2015-03-10 | 2015-08-10 | 2015-02-21 | 2015-04-16
 user2   | 2015-08-10  |           3 | 1500.00    |  5 | 2015-04-14 | 2015-08-15 | 2015-02-21 | 2015-08-10
 user2   | 2015-08-15  |           2 | 400.00     |  6 | 2015-04-16 | 2015-11-02 | 2015-02-21 | 2015-08-15
 user2   | 2015-11-02  |           1 | 500.00     |  7 | 2015-08-10 | 2016-03-10 | 2015-02-21 | 2015-11-02
 user2   | 2016-03-10  |           1 | 700.00     |  8 | 2015-08-15 | NULL       | 2015-02-21 | 2016-03-10

GROUPING SETS,GROUPING__ID(标识分组集合,用数字简单标识)

注:使用union操作会增加IO开销,会减少cpu和内存的开销,使用grouping sets会减少IO开销,会增加cpu和内存的消耗.
union all查询的时候需要添加为子查询,hive不支持顶层union all

select
  substr(create_time,1,7) as month,
  create_time as day,
  sum(flight_size) as flight_size,
  grouping__id
from
  tmp.tmp_test_function_1
group by substr(create_time,1,7), create_time
grouping sets(substr(create_time,1,7), create_time)
order by
  grouping__id        -- 这是两道下划线!!!
month       day         flight_size     grouping__id
-------------------------------------------------
2015-11     NULL            1               1
2015-08     NULL            5               1
2015-07     NULL            2               1
2016-03     NULL            1               1
2015-05     NULL            7               1
2015-04     NULL            15              1
2015-03     NULL            9               1
2015-02     NULL            1               1
NULL        2015-08-10      3               2
NULL        2016-03-10      1               2
NULL        2015-11-02      1               2
NULL        2015-08-15      2               2
NULL        2015-07-08      2               2
NULL        2015-05-10      7               2
NULL        2015-04-29      4               2
NULL        2015-04-16      4               2
NULL        2015-04-14      6               2
NULL        2015-04-11      1               2
NULL        2015-03-15      1               2
NULL        2015-03-12      5               2
NULL        2015-03-10      3               2
NULL        2015-02-21      1               2

相当于

SELECT substr(create_time,1,7) as month,NULL as day,sum(flight_size) AS flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL  
SELECT NULL as month,create_time as day,sum(flight_size) AS flight_size,2 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY create_time

cube(根据GROUP BY的维度的所有组合进行聚合)

select
  substr(create_time,1,7) as month,
  create_time as day,
  sum(flight_size) as flight_size,
  grouping__id
from
  tmp.tmp_test_function_1
group by substr(create_time,1,7),create_time
with cube
grouping sets(substr(create_time,1,7), create_time, (substr(create_time,1,7),create_time))
order by
  grouping__id;
month       day             flight_size grouping__id
-------------------------------------------------
NULL        NULL            41          0
2015-03     NULL            9           1
2016-03     NULL            1           1
2015-02     NULL            1           1
2015-07     NULL            2           1
2015-11     NULL            1           1
2015-04     NULL            15          1
2015-05     NULL            7           1
2015-08     NULL            5           1
NULL        2015-11-02      1           2
NULL        2015-08-15      2           2
NULL        2015-08-10      3           2
NULL        2015-07-08      2           2
NULL        2015-05-10      7           2
NULL        2015-04-29      4           2
NULL        2015-04-16      4           2
NULL        2015-04-14      6           2
NULL        2015-04-11      1           2
NULL        2015-03-15      1           2
NULL        2015-03-12      5           2
NULL        2015-03-10      3           2
NULL        2015-02-21      1           2
NULL        2016-03-10      1           2
2016-03     2016-03-10      1           3
2015-11     2015-11-02      1           3
2015-08     2015-08-15      2           3
2015-08     2015-08-10      3           3
2015-07     2015-07-08      2           3
2015-05     2015-05-10      7           3
2015-04     2015-04-29      4           3
2015-04     2015-04-16      4           3
2015-04     2015-04-14      6           3
2015-04     2015-04-11      1           3
2015-03     2015-03-15      1           3
2015-03     2015-03-12      5           3
2015-02     2015-02-21      1           3
2015-03     2015-03-10      3           3

相当于

SELECT NULL,NULL,sum(flight_size) as flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL
SELECT substr(create_time,1,7),NULL,sum(flight_size) as flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL  
SELECT NULL,create_time,sum(flight_size) as flight_size,2 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY create_time
UNION ALL  
SELECT substr(create_time,1,7),create_time,sum(flight_size) as flight_size,3 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7),create_time

注:这个是分组字段的所有组合方式,包括null null

rollup(根据左侧的维度来进行层级聚合)

select
  substr(create_time,1,7) as month,
  create_time as day,
  sum(flight_size) as flight_size,
  grouping__id
from
  tmp.tmp_test_function_1
group by substr(create_time,1,7),create_time
with rollup
grouping sets(substr(create_time,1,7), create_time, (substr(create_time,1,7),create_time))
order by
  grouping__id

month       day         flight_size grouping__id
-------------------------------------------------
NULL        NULL            41      0
2015-03     NULL            9       1
2015-08     NULL            5       1
2015-02     NULL            1       1
2016-03     NULL            1       1
2015-07     NULL            2       1
2015-04     NULL            15      1
2015-11     NULL            1       1
2015-05     NULL            7       1
2016-03     2016-03-10      1       3
2015-11     2015-11-02      1       3
2015-08     2015-08-15      2       3
2015-08     2015-08-10      3       3
2015-07     2015-07-08      2       3
2015-05     2015-05-10      7       3
2015-04     2015-04-16      4       3
2015-04     2015-04-14      6       3
2015-04     2015-04-11      1       3
2015-03     2015-03-15      1       3
2015-03     2015-03-12      5       3
2015-03     2015-03-10      3       3
2015-02     2015-02-21      1       3
2015-04     2015-04-29      4       3

相当于

SELECT NULL,NULL,sum(flight_size) as flight_size,0 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL
SELECT substr(create_time,1,7),NULL,sum(flight_size) as flight_size,1 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7)
UNION ALL   
SELECT substr(create_time,1,7),create_time,sum(flight_size) as flight_size,3 AS GROUPING__ID FROM tmp.tmp_test_function_1 GROUP BY substr(create_time,1,7),create_time

注:高维在左则进行下钻过程,低维在左则进行上钻

列转行

concat_ws+collect_set(去重)/collect_list(列转行)
用途示例:在一条记录中显示某用户某天购买的所有机票订单号

select
    user_id,
    concat_ws(',', collect_set(cast(create_time as string))) as times
from
    tmp.tmp_test_function_1
group by
    user_id
userid  times
------------------------------------------------------------------------------------------------
user1   2015-03-10,2015-03-12,2015-03-15,2015-04-11,2015-04-29,2015-05-10,2015-07-08
user2   2015-02-21,2015-03-10,2015-04-14,2015-04-16,2015-08-10,2015-08-15,2015-11-02,2016-03-10

行转列

lateral view explode(行转列)
用途示例:某一条用户记录中对应着多个常用目的地,希望统计目的地的个数,可以在explod中使用split指定分隔符

select
    user_id,
    time
from
(
select
    user_id,
    concat_ws(',', collect_set(cast(create_time as string))) as times
from
    tmp.tmp_test_function_1
group by
    user_id
)A
lateral view explode(split(times, ',')) adTable as time
user_id time
-------------------
user1   2015-03-10
user1   2015-03-12
user1   2015-03-15
user1   2015-04-11
user1   2015-04-29
user1   2015-05-10
user1   2015-07-08
user2   2015-02-21
user2   2015-03-10
user2   2015-04-14
user2   2015-04-16
user2   2015-08-10
user2   2015-08-15
user2   2015-11-02
user2   2016-03-10
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

TriumPhSK

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值