hive窗口函数记录

一 、 排序 row_number, rank,dense_rank

  1. row_number(): 分组内,从1到n连续
  2. rank(): 分组内,排序如下:1,2,2,3
  3. dense_rank():分组内,排序如下:1,2,2,4

准备数据:
在这里插入图片描述

create table personas_test_2
 as 
select "cookid_01" AS cookid , "2017-09-18" as create_time ,1 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,9 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,8 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,7 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,6 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,5 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,4 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,3 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,2 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,0 as pv union all
select "cookid_01" AS cookid , "2017-09-18" as create_time ,10 as pv union all
select "cookid_02" AS cookid , "2017-09-18" as create_time ,1 as pv union all
select "cookid_02" AS cookid , "2017-09-18" as create_time ,9 as pv union all
select "cookid_02" AS cookid , "2017-09-18" as create_time ,8 as pv union all
select "cookid_02" AS cookid , "2017-09-18" as create_time ,7 as pv union all
select "cookid_02" AS cookid , "2017-09-18" as create_time ,6 as pv union all
select "cookid_03" AS cookid , "2017-09-18" as create_time ,5 as pv union all
select "cookid_03" AS cookid , "2017-09-18" as create_time ,4 as pv union all
select "cookid_03" AS cookid , "2017-09-18" as create_time ,3 as pv union all
select "cookid_03" AS cookid , "2017-09-18" as create_time ,2 as pv union all
select "cookid_03" AS cookid , "2017-09-18" as create_time ,0 as pv union all
select "cookid_03" AS cookid , "2017-09-18" as create_time ,10 as pv

sql如下:

select 
   cookid
   ,create_time
   ,pv
   ,row_number() over(partition by create_time order by pv desc) as row_number_rn
   ,rank() over(partition by create_time order by pv desc)       as rank_rn
   ,dense_rank() over(partition by create_time order by pv desc) as dense_rank_rn
from personas_test_2 

结果:在这里插入图片描述

二、sum、avg、min、max、count

preceding:往前
following:往后
current row:当前行
unbounded:起点,unbounded preceding 表示从前面的起点, unbounded following:表示到后面的终点

select 
    cookid
    ,create_time
    ,pv
    ,sum(pv) over(partition by cookid) as total_0 --分组内所有数据
    ,sum(pv) over(partition by cookid order by pv asc) as total_1 --分组内第一行到当前行
    ,sum(pv) over(partition by cookid order by pv asc rows between unbounded preceding and current row) as total_2 --同上total_1
    ,sum(pv) over(partition by cookid order by pv asc rows between 1 preceding and current row)         as total_3 --分组内当前行+前一行
    ,sum(pv) over(partition by cookid order by pv asc rows between 1 preceding and 2 following)         as total_4 --分组内前一行+当前行+往下两行
    ,avg(pv) over(partition by cookid) as avg_5
    ,max(pv) over(partition by cookid) as max_5
    ,min(pv) over(partition by cookid) as min_5
    ,count(1) over(partition by cookid) as count_5
from personas_test_2

结果如下:在这里插入图片描述

三、ntile

ntile(n),用于将分组数据按照顺序切分成n片,返回当前切片值,如果切片不均匀,默认增加第一个切片的分布。ntile不支持rows between

select 
    cookid
    ,create_time
    ,pv
    ,ntile(11) over(partition by cookid) as ntile_0
    ,ntile(10) over(partition by cookid) as ntile_1
    ,ntile(5) over(partition by cookid)  as ntile_2
from personas_test_2

在这里插入图片描述
一般使用场景是取出前百分之几的数据

四、lead,lag,first_value,last_value

1. LEAD(col,n,DEFAULT) 用于统计窗口内往下第n行值: 第一个参数为列名,第二个参数为往下第n行(可选,默认为1),第三个参数为默认值(当往下第n行为NULL时候,取默认值,如不指定,则为NULL)
2. LAG(col,n,DEFAULT) 用于统计窗口内往上第n行值: 第一个参数为列名,第二个参数为往上第n行(可选,默认为1),第三个参数为默认值(当往上第n行为NULL时候,取默认值,如不指定,则为NULL)
3. FIRST_VALUE:取分组内排序后,截止到当前行,第一个值
4. LAST_VALUE:取分组内排序后,截止到当前行,最后一个值

select 
    cookid
    ,create_time
    ,pv
    ,lead(pv,1,999) over(partition by cookid order by pv asc)     as lead_0
    ,lag(pv,1,999) over(partition by cookid order by pv asc)      as lag_1
    ,first_value(pv) over(partition by cookid order by pv asc )   as first_2
    ,last_value(pv) over(partition by cookid order by pv asc )    as last_3
from personas_test_2

在这里插入图片描述

五、灵活应用场景:

取用户连续登录最大天数

如下数据:

create table personas_test_3
 as 
select "cookid_01" AS cookid , "2021-03-05" as log_time union all
select "cookid_01" AS cookid , "2021-03-06" as log_time union all
select "cookid_01" AS cookid , "2021-03-07" as log_time union all
select "cookid_01" AS cookid , "2021-03-11" as log_time union all
select "cookid_01" AS cookid , "2021-03-12" as log_time union all
select "cookid_01" AS cookid , "2021-03-13" as log_time union all
select "cookid_01" AS cookid , "2021-03-14" as log_time union all
select "cookid_01" AS cookid , "2021-03-15" as log_time union all
select "cookid_01" AS cookid , "2021-03-18" as log_time union all
select "cookid_01" AS cookid , "2021-03-20" as log_time union all
select "cookid_01" AS cookid , "2021-03-21" as log_time  union all
select "cookid_02" AS cookid , "2021-03-22" as log_time union all
select "cookid_02" AS cookid , "2021-03-20" as log_time union all
select "cookid_02" AS cookid , "2021-03-21" as log_time union all
select "cookid_02" AS cookid , "2021-03-10" as log_time union all
select "cookid_02" AS cookid , "2021-03-11" as log_time union all
select "cookid_03" AS cookid , "2021-03-15" as log_time union all
select "cookid_03" AS cookid , "2021-03-16" as log_time union all
select "cookid_03" AS cookid , "2021-03-17" as log_time union all
select "cookid_03" AS cookid , "2021-03-18" as log_time union all
select "cookid_03" AS cookid , "2021-03-19" as log_time union all
select "cookid_03" AS cookid , "2021-03-20" as log_time 

在这里插入图片描述
期望结果:在这里插入图片描述

select
    cookid
    ,max(num) as max_num
FROM
(
    select
        cookid
        ,log_time
        ,row_number() over(partition by cookid order by log_time asc) as rn --按照日期排序
        ,date_sub(log_time,row_number() over(partition by cookid order by log_time asc)) as sub_date --登录日期减去排序序号
        ,count(1) over(partition by cookid,date_sub(log_time,row_number() over(partition by cookid order by log_time asc))) as num  --根据上面sub_date和用户id分组计数
    from personas_test_3
)t
group by cookid
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值