ntile()函数的作用是等频分箱,把观测值进行有序排列(默认升序),根据观测值的总个数等分为k部分,每部分当作一个分箱,即百分位数的概念,可以根据箱号选取前或后n分之几的数据。
函数方法:
1、ntile(n) over(order by col) as bucket_num
n是指定的分箱数量。如果不能平均分配,则优先分配较小编号的箱,并且各个箱中能放的行数最多相差1。
备注:NULL值的处理,可以设置单独为一组,或者默认为最小值
select col
-- NULL默认为最小值
, ntile(2) over( order by col) as group1
-- 将NULL单独为1组
, if(col is null, null, ntile(2) over( partition by if(col is null, 1, 0) order by col)) as group2
from
(
select cast(col as int) as col
from
(
select stack(5, 'NULL', '1', '2', '3', '4') as col
) as a
) as a;
结果:
col group1 group2
1 1 1
2 1 1
3 2 2
4 2 2
NULL 1 NULL
Time taken: 89.01 seconds, Fetched: 5 row(s)
2、percent_rank() over(order by col):先得出每个值对应的百分位数,再根据实际需求分箱
步骤1、
select col
-- NULL默认为最小值
, percent_rank() over( order by col) as group1
-- 将NULL单独为1组
, if(col is null, null, percent_rank() over( partition by if(col is null, 1, 0) order by col)) as group2
from(
select cast(col as int) as col
from
(
select stack(5, 'NULL', '1', '2', '3', '4') as col
) as a
) as a
结果:
col group1 group2
1 0.25 0.0
2 0.5 0.3333333333333333
3 0.75 0.6666666666666666
4 1.0 1.0
NULL 0.0 NULL
Time taken: 129.701 seconds, Fetched: 5 row(s)
步骤2、
select col
-- 根据百分位数划分
, if(group1<0.5, 1, if(group1<=1.0, 2, null)) as group1
, if(group2<0.5, 1, if(group2<=1.0, 2, null)) as group2
from
(
select col
-- NULL默认为最小值
, percent_rank() over( order by col) as group1
-- 将NULL单独为1组
, if(col is null, null, percent_rank() over( partition by if(col is null, 1, 0) order by col)) as group2
from(
select cast(col as int) as col
from
(
select stack(5, 'NULL', '1', '2', '3', '4') as col
) as a
) as a
) as a
结果:
col group1 group2
1 1 1
2 2 1
3 2 2
4 2 2
NULL 1 NULL
Time taken: 0.355 seconds, Fetched 5 row(s)
3、hive等分数据示例
select
t1.cookieid
,t1.createtime
,t1.pv
,ntile(2) over(order by t1.pv) as pt --分组内将数据分成2片
,row_number() over(order by t1.pv) as rn
,count(1) over() as cn
from
(
select 'cookie1' as cookieid ,'2015-04-10' as createtime, 1 as pv union all
select 'cookie1' as cookieid ,'2015-04-11' as createtime, 2 as pv union all
select 'cookie1' as cookieid ,'2015-04-12' as createtime, 3 as pv union all
select 'cookie1' as cookieid ,'2015-04-13' as createtime, 4 as pv union all
select 'cookie1' as cookieid ,'2015-04-14' as createtime, 5 as pv union all
select 'cookie1' as cookieid ,'2015-04-15' as createtime, 6 as pv union all
select 'cookie1' as cookieid ,'2015-04-16' as createtime, 7 as pv union all
select 'cookie2' as cookieid ,'2015-04-10' as createtime, 8 as pv union all
select 'cookie2' as cookieid ,'2015-04-11' as createtime, 9 as pv union all
select 'cookie2' as cookieid ,'2015-04-12' as createtime, 10 as pv union all
select 'cookie2' as cookieid ,'2015-04-13' as createtime, 11 as pv union all
select 'cookie2' as cookieid ,'2015-04-14' as createtime, 12 as pv union all
select 'cookie2' as cookieid ,'2015-04-15' as createtime, 13 as pv union all
select 'cookie2' as cookieid ,'2015-04-16' as createtime, 14 as pv
) t1
结果:
cookieid createtime pv pt rn cn
cookie1 2015-04-10 1 1 1 14
cookie1 2015-04-11 2 1 2 14
cookie1 2015-04-12 3 1 3 14
cookie1 2015-04-13 4 1 4 14
cookie1 2015-04-14 5 1 5 14
cookie1 2015-04-15 6 1 6 14
cookie1 2015-04-16 7 1 7 14
cookie2 2015-04-10 8 2 8 14
cookie2 2015-04-11 9 2 9 14
cookie2 2015-04-12 10 2 10 14
cookie2 2015-04-13 11 2 11 14
cookie2 2015-04-14 12 2 12 14
cookie2 2015-04-15 13 2 13 14
cookie2 2015-04-16 14 2 14 14
Time taken: 0.422 seconds, Fetched 14 row(s)
4、增加排序模块
select
t3.*
from
(
select
t2.*
,row_number() over(partition by t2.pt order by t2.pv) as rn2
from
(
select
t1.cookieid
,t1.createtime
,t1.pv
,ntile(2) over(order by t1.pv) as pt --分组内将数据分成2片
,row_number() over(order by t1.pv) as rn
,count(1) over() as cn
from
(
select 'cookie1' as cookieid ,'2015-04-10' as createtime, 1 as pv union all
select 'cookie1' as cookieid ,'2015-04-11' as createtime, 2 as pv union all
select 'cookie1' as cookieid ,'2015-04-12' as createtime, 3 as pv union all
select 'cookie1' as cookieid ,'2015-04-13' as createtime, 4 as pv union all
select 'cookie1' as cookieid ,'2015-04-14' as createtime, 5 as pv union all
select 'cookie1' as cookieid ,'2015-04-15' as createtime, 6 as pv union all
select 'cookie1' as cookieid ,'2015-04-16' as createtime, 7 as pv union all
select 'cookie2' as cookieid ,'2015-04-10' as createtime, 8 as pv union all
select 'cookie2' as cookieid ,'2015-04-11' as createtime, 9 as pv union all
select 'cookie2' as cookieid ,'2015-04-12' as createtime, 10 as pv union all
select 'cookie2' as cookieid ,'2015-04-13' as createtime, 11 as pv union all
select 'cookie2' as cookieid ,'2015-04-14' as createtime, 12 as pv union all
select 'cookie2' as cookieid ,'2015-04-15' as createtime, 13 as pv union all
select 'cookie2' as cookieid ,'2015-04-16' as createtime, 14 as pv
) t1
) t2
) t3
where t3.rn2 = 1 or t3.rn = t3.cn;
结果:
cookieid createtime pv pt rn cn rn2
cookie1 2015-04-10 1 1 1 14 1
cookie2 2015-04-10 8 2 8 14 1
cookie2 2015-04-16 14 2 14 14 7
Time taken: 3.119 seconds, Fetched 3 row(s)