杂记3_first_value窗口+表追加指标开发+中位数+ntile窗口开发

最新推荐文章于 2023-08-07 09:41:37 发布

Anla Likes Sunshine

最新推荐文章于 2023-08-07 09:41:37 发布

阅读量614

点赞数

分类专栏：大数据

本文链接：https://blog.csdn.net/AnlaGodness/article/details/109448281

版权

大数据专栏收录该内容

48 篇文章 1 订阅

订阅专栏

1.1、需求：获取id1的最新id2

with t as (
select 1 as id1,80 as id2,1 as time 
union all
select 1 as id1,90 as id2,2 as time 
union all 
select 1 as id1,70 as id2,3 as time 
union all
select 2 as id1,20 as id2,1 as time 
union all
select 2 as id1,30 as id2,2 as time 
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,max(id2) over(partition by id1 order by time desc) latestid2 from t;

结果却是，所以要谨慎使用窗口函数的 order by：
在这里插入图片描述

预想目标列-latestid2是，通过first_value() 窗口函数才能实现此功能：
在这里插入图片描述

with t as (
select 1 as id1,80 as id2,1 as time 
union all
select 1 as id1,90 as id2,2 as time 
union all 
select 1 as id1,70 as id2,3 as time 
union all
select 2 as id1,20 as id2,1 as time 
union all
select 2 as id1,30 as id2,2 as time 
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,first_value(id2) over(partition by id1 order by time desc) latestid2 from t;

1.2、获取id1最大的id2，order by 可以去掉：

 with t as (
select 1 as id1,80 as id2,1 as time 
union all
select 1 as id1,90 as id2,2 as time 
union all 
select 1 as id1,70 as id2,3 as time 
union all
select 2 as id1,20 as id2,1 as time 
union all
select 2 as id1,30 as id2,2 as time 
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,max(id2) over(partition by id1) latestid2 from t;

效果：
在这里插入图片描述
2、新增指标开发流程：
2.1、测试库建表+增列
注意：分区表不能通过create table as select 的方式，需要show create table 开发库表，再copy 表结构到 create table 测试库表后面，否则建立的是非分区表；
2.2、将要增加的列与未增加列的开发库表对数，对数方式如2.2.1。关联方式的选择要通过理解源表业务和结果表业务：

如果能让关联键为同个范围，使用full outer join；
如果不能，则使用left join；

2.3、测试库运行、与开发库表对数，对数方式如2.3.1，如2.2.1对数完成，这一步应该可以省略；
2.4、开发库增列；
2.5、上线；

2.2.1、增加的列如何对数，注意count(列值)不会计入null 值，因此count(1) + 过滤条件为t2.id is null 才能查出t2表关联不上t1表的数量。

select count(t2.id) from (
     select * from 开发库表 
) t1 full outer join (
     增加的列逻辑
) t2 on t1.id = t2.id
where t1.id is null or t2.id is null;

2.3.1、测试库表如何对数

select count(t2.id) from (
        select * from 开发库表
) t1 full outer join (
    select * from 测试库表
) t2 on t1.id = t2.id
where t1.id is null or t2.id is null;

3、中位数
impala: appx_median(col)
hive: percentile、percentile_approx、ntile() 窗口函数
从以下测试例子可以看到，函数会自动对值排序，但是percentile、percentile_approx不一定会取值列表里的数据。于是摸索 ntile() 窗口函数来实现中位数功能，以下是过程记录。
3.1、impala 实现中位数功能，效果：

select appx_median(score) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--30
select appx_median(score) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--40

3.2、hive 的 percentile、percentile_approx函数，效果：

select percentile(score, 0.3) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--26
select percentile(score, array(0.5,0.5)) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--[40.0,40.0]
select percentile(score, 0.5) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--40
select percentile(score, 0.5) from (select 20 as score union all select 50 as score) t ;--35
select percentile_approx(score, 0.3) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--20
select percentile_approx(score, 0.5) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--30
select percentile_approx(score, 0.5) from (select score from (select 20 as score union all select 50 as score union all select 40 as score) t  order by score) tt;--30

3.3、hive 的 ntile() 窗口函数
3.3.1、ntile() 窗口函数，未分组的效果：

select score,ntile(3) over(order by score asc) from (select 20 as score union all select 50 as score union all select 40 as score) t;

在这里插入图片描述
选出中位数：

select score from (
select score,ntile(3) over(order by score asc) as loc from (select 20 as score union all select 50 as score union all select 40 as score) t
) tt where loc=2;

在这里插入图片描述
3个数，分成 2桶的效果：

select score,ntile(2) over(order by score asc) as loc from (select 20 as score union all select 50 as score union all select 40 as score) t

在这里插入图片描述
3.3.2、ntile() 窗口函数，单分组 3位数分成90桶案例：

1、select id,score,ntile(90) over(partition by id order by score asc) as loc from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t;
2、select score from (
select id,score,ntile(90) over(partition by id order by score asc) as loc from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) tt where loc=45;--none

效果为：
在这里插入图片描述

注意：ntile里面必须为定量，是变量的话，会报错，如：

select id,score,ntile(cnt) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t 
    group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) t2 on t1.id=t2.id
) tt

hive报错：
在这里插入图片描述
impala报错：

3.3.3、ntile() 窗口函数，单分组实现中位数案例：

select id,score
from (
select id,score,cnt,ntile(3) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t 
    group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) t2 on t1.id=t2.id
) tt
) ttt
where loc=round(cnt/2,0);

效果：
在这里插入图片描述
3.3.4、ntile() 窗口函数，多分组实现中位数案例，关键是ntile设置分桶数量必须 >= 分组最大数量：

select id,score
from (
select id,score,cnt,ntile(4) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score 
union all select 2 as id,80 as score union all select 2 as id,60 as score union all select 2 as id,55 as score union all select 2 as id,20 as score) t 
    group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score
union all select 2 as id,80 as score union all select 2 as id,60 as score union all select 2 as id,55 as score union all select 2 as id,20 as score) t
) t2 on t1.id=t2.id
) tt
) ttt
where loc=round(cnt/2,0);

效果：
在这里插入图片描述
4、使用count 注意，count(列) 当列值为null时，不会计入。

select count(1) from (select null as id union all select null as id) t;--2 
select count(id) from (select null as id union all select null as id) t;--0 count(null)不会计入

Anla Likes Sunshine

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
杂记3_first_value窗口+表追加指标开发+中位数+ntile窗口开发

1.1、需求：获取id1的最新id2with t as (select 1 as id1,80 as id2,1 as time union allselect 1 as id1,90 as id2,2 as time union all select 1 as id1,70 as id2,3 as time union allselect 2 as id1,20 as id2,1 as time union allselect 2 as id1,30 as id2,2 as time
复制链接

扫一扫