1.1、需求:获取id1的最新id2
with t as (
select 1 as id1,80 as id2,1 as time
union all
select 1 as id1,90 as id2,2 as time
union all
select 1 as id1,70 as id2,3 as time
union all
select 2 as id1,20 as id2,1 as time
union all
select 2 as id1,30 as id2,2 as time
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,max(id2) over(partition by id1 order by time desc) latestid2 from t;
结果却是,所以要谨慎使用窗口函数的 order by:
预想目标列-latestid2是,通过first_value() 窗口函数才能实现此功能:
with t as (
select 1 as id1,80 as id2,1 as time
union all
select 1 as id1,90 as id2,2 as time
union all
select 1 as id1,70 as id2,3 as time
union all
select 2 as id1,20 as id2,1 as time
union all
select 2 as id1,30 as id2,2 as time
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,first_value(id2) over(partition by id1 order by time desc) latestid2 from t;
1.2、获取id1最大的id2,order by 可以去掉:
with t as (
select 1 as id1,80 as id2,1 as time
union all
select 1 as id1,90 as id2,2 as time
union all
select 1 as id1,70 as id2,3 as time
union all
select 2 as id1,20 as id2,1 as time
union all
select 2 as id1,30 as id2,2 as time
union all
select 2 as id1,40 as id2,3 as time
)
select id1,id2,time,max(id2) over(partition by id1) latestid2 from t;
效果:
2、新增指标开发流程:
2.1、测试库建表+增列
注意:分区表不能通过create table as select 的方式,需要show create table 开发库表,再copy 表结构到 create table 测试库表后面,否则建立的是非分区表;
2.2、将要增加的列 与 未增加列的开发库表 对数,对数方式如2.2.1。关联方式的选择要通过理解源表业务和结果表业务:
- 如果能让关联键为同个范围,使用full outer join;
- 如果不能,则使用left join;
2.3、测试库运行、与 开发库表 对数,对数方式如2.3.1,如2.2.1对数完成,这一步应该可以省略;
2.4、开发库增列;
2.5、上线;
2.2.1、增加的列 如何对数,注意count(列值)不会计入null 值,因此count(1) + 过滤条件为t2.id is null 才能查出t2表关联不上t1表的数量。
select count(t2.id) from (
select * from 开发库表
) t1 full outer join (
增加的列逻辑
) t2 on t1.id = t2.id
where t1.id is null or t2.id is null;
2.3.1、测试库表 如何对数
select count(t2.id) from (
select * from 开发库表
) t1 full outer join (
select * from 测试库表
) t2 on t1.id = t2.id
where t1.id is null or t2.id is null;
3、中位数
impala: appx_median(col)
hive: percentile、percentile_approx、ntile() 窗口函数
从以下测试例子可以看到,函数会自动对值排序,但是percentile、percentile_approx不一定会取值列表里的数据。于是摸索 ntile() 窗口函数来实现中位数功能,以下是过程记录。
3.1、impala 实现中位数功能,效果:
select appx_median(score) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--30
select appx_median(score) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--40
3.2、hive 的 percentile、percentile_approx函数,效果:
select percentile(score, 0.3) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--26
select percentile(score, array(0.5,0.5)) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--[40.0,40.0]
select percentile(score, 0.5) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--40
select percentile(score, 0.5) from (select 20 as score union all select 50 as score) t ;--35
select percentile_approx(score, 0.3) from (select 20 as score union all select 30 as score union all select 40 as score) t ;--20
select percentile_approx(score, 0.5) from (select 20 as score union all select 50 as score union all select 40 as score) t ;--30
select percentile_approx(score, 0.5) from (select score from (select 20 as score union all select 50 as score union all select 40 as score) t order by score) tt;--30
3.3、hive 的 ntile() 窗口函数
3.3.1、ntile() 窗口函数,未分组的效果:
select score,ntile(3) over(order by score asc) from (select 20 as score union all select 50 as score union all select 40 as score) t;
选出中位数:
select score from (
select score,ntile(3) over(order by score asc) as loc from (select 20 as score union all select 50 as score union all select 40 as score) t
) tt where loc=2;
3个数,分成 2桶 的效果:
select score,ntile(2) over(order by score asc) as loc from (select 20 as score union all select 50 as score union all select 40 as score) t
3.3.2、ntile() 窗口函数,单分组 3位数 分成90桶案例:
1、select id,score,ntile(90) over(partition by id order by score asc) as loc from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t;
2、select score from (
select id,score,ntile(90) over(partition by id order by score asc) as loc from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) tt where loc=45;--none
效果为:
注意:ntile里面必须为定量,是变量的话,会报错,如:
select id,score,ntile(cnt) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) t2 on t1.id=t2.id
) tt
hive报错:
impala报错:
3.3.3、ntile() 窗口函数,单分组实现中位数案例:
select id,score
from (
select id,score,cnt,ntile(3) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score) t
) t2 on t1.id=t2.id
) tt
) ttt
where loc=round(cnt/2,0);
效果:
3.3.4、ntile() 窗口函数,多分组实现中位数案例,关键是ntile设置分桶数量必须 >= 分组最大数量:
select id,score
from (
select id,score,cnt,ntile(4) over(partition by id order by score asc) as loc
from (
select t1.id,t1.cnt,score
from (select id,count(1) cnt from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score
union all select 2 as id,80 as score union all select 2 as id,60 as score union all select 2 as id,55 as score union all select 2 as id,20 as score) t
group by id) t1
join (select id,score from (select 1 as id,20 as score union all select 1 as id,50 as score union all select 1 as id,40 as score
union all select 2 as id,80 as score union all select 2 as id,60 as score union all select 2 as id,55 as score union all select 2 as id,20 as score) t
) t2 on t1.id=t2.id
) tt
) ttt
where loc=round(cnt/2,0);
效果:
4、使用count 注意,count(列) 当列值为null时,不会计入。
select count(1) from (select null as id union all select null as id) t;--2
select count(id) from (select null as id union all select null as id) t;--0 count(null)不会计入