杂记3_first_value窗口+表追加指标开发+中位数+ntile窗口开发

1.1、需求:获取id1的最新id2

with t as (
select 1 as id1,80 as id2,1 as time 
union all
select 1 as id1,90 as id2,2 as time 
union all 
select 1 as id1,70 as id2,3 as time 
union all
select 2 as id1,20 as id2,1 as time 
union all
select 2 as id1,30 as id2,2 as time 
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,max(id2) over(partition by id1 order by time desc) latestid2 from t;

结果却是,所以要谨慎使用窗口函数的 order by:
在这里插入图片描述

预想目标列-latestid2是,通过first_value() 窗口函数才能实现此功能:
在这里插入图片描述

with t as (
select 1 as id1,80 as id2,1 as time 
union all
select 1 as id1,90 as id2,2 as time 
union all 
select 1 as id1,70 as id2,3 as time 
union all
select 2 as id1,20 as id2,1 as time 
union all
select 2 as id1,30 as id2,2 as time 
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,first_value(id2) over(partition by id1 order by time desc) latestid2 from t;

1.2、获取id1最大的id2,order by 可以去掉:

 with t as (
select 1 as id1,80 as id2,1 as time 
union all
select 1 as id1,90 as id2,2 as time 
union all 
select 1 as id1,70 as id2,3 as time 
union all
select 2 as id1,20 as id2,1 as time 
union all
select 2 as id1,30 as id2,2 as time 
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,max(id2) over(partition by id1) latestid2 from t;

效果:
在这里插入图片描述
2、新增指标开发流程:
2.1、测试库建表+增列
注意:分区表不能通过create table as select 的方式,需要show create table 开发库表,再copy 表结构到 create table 测试库表后面,否则建立的是非分区表;
2.2、将要增加的列 与 未增加列的开发库表 对数,对数方式如2.2.1。关联方式的选择要通过理解源表业务和结果表业务:

  • 如果能让关联键为同个范围,使用full outer join;
  • 如果不能,则使用left join;

2.3、测试库运行、与 开发库表 对数,对数方式如2.3.1,如2.2.1对数完成,这一步应该可以省略;
2.4、开发库增列;
2.5、上线;

2.2.1、增加的列 如何对数,注意count(列值)不会计入null 值,因此count(1) + 过滤条件为t2.id is null 才能查出t2表关联不上t1表的数量。

select count(t2.id) from (
     select * from 开发库表 
) t1 full outer join (
     增加的列逻辑
) t2 on t1.id = t2.id
where t1.id is null or t2.id is null;

2.3.1、测试库表 如何对数

select count(t2.id) from (
        select * from 开发库表
) t1 full outer join (
    select * from 测试库表
) t2 on t1.id = t2.id
where t1.id is null or t2.id is null;

3、中位数
impala: appx_median(col)
hive: percentile、percentile_approx、ntile() 窗口函数
从以下测试例子可以看到,函数会自动对值排序,但是percentile、percentile_approx不一定会取值列表里的数据。于是摸索 ntile() 窗口函数来实现中位数功能,以下是过程记录。
3.1、impala 实现中位数功能,效果:

select appx_median(score) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--30
select appx_median(score) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--40

3.2、hive 的 percentile、percentile_approx函数,效果:

select percentile(score, 0.3) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--26
select percentile(score, array(0.5,0.5)) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--[40.0,40.0]
select percentile(score, 0.5) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--40
select percentile(score, 0.5) from (select 20 as score union all select 50 as score) t ;--35
select percentile_approx(score, 0.3) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--20
select percentile_approx(score, 0.5) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--30
select percentile_approx(score, 0.5) from (select score from (select 20 as score union all select 50 as score union all select 40 as score) t  order by score) tt;--30

3.3、hive 的 ntile() 窗口函数
3.3.1、ntile() 窗口函数,未分组的效果:

select score,ntile(3) over(order by score asc) from (select 20 as score union all select 50 as score union all select 40 as score) t;

在这里插入图片描述
选出中位数:

select score from (
select score,ntile(3) over(order by score asc) as loc from (select 20 as score union all select 50 as score union all select 40 as score) t
) tt where loc=2;

在这里插入图片描述
3个数,分成 2桶 的效果:

select score,ntile(2) over(order by score asc) as loc from (select 20 as score union all select 50 as score union all select 40 as score) t

在这里插入图片描述
3.3.2、ntile() 窗口函数,单分组 3位数 分成90桶案例:

1select id,score,ntile(90) over(partition by id order by score asc) as loc from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t;
2select score from (
select id,score,ntile(90) over(partition by id order by score asc) as loc from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) tt where loc=45;--none

效果为:
在这里插入图片描述

注意:ntile里面必须为定量,是变量的话,会报错,如:

select id,score,ntile(cnt) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t 
    group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) t2 on t1.id=t2.id
) tt

hive报错:
在这里插入图片描述
impala报错:
在这里插入图片描述
3.3.3、ntile() 窗口函数,单分组实现中位数案例:

select id,score
from (
select id,score,cnt,ntile(3) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t 
    group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) t2 on t1.id=t2.id
) tt
) ttt
where loc=round(cnt/2,0);

效果:
在这里插入图片描述
3.3.4、ntile() 窗口函数,多分组实现中位数案例,关键是ntile设置分桶数量必须 >= 分组最大数量:

select id,score
from (
select id,score,cnt,ntile(4) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score 
union all select 2 as id,80 as score union all select 2 as id,60 as score union all select 2 as id,55 as score union all select 2 as id,20 as score) t 
    group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score
union all select 2 as id,80 as score union all select 2 as id,60 as score union all select 2 as id,55 as score union all select 2 as id,20 as score) t
) t2 on t1.id=t2.id
) tt
) ttt
where loc=round(cnt/2,0);

效果:
在这里插入图片描述
4、使用count 注意,count(列) 当列值为null时,不会计入。

select count(1) from (select null as id union all select null as id) t;--2 
select count(id) from (select null as id union all select null as id) t;--0 count(null)不会计入
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值