这个函数的大致意思是:在分组内求去重后的数量
为什么不用count(distinct ),不支持原因
例子:
数据准备:
SELECT '1' as id ,'201808' as m,'a' as k
union all
SELECT '2' as id ,'201808' as m,'a' as k
union all
SELECT '1' as id ,'201809' as m,'a' as k
union all
SELECT '1' as id ,'201808' as m,'b' as k
union all
SELECT '2' as id ,'201809' as m,'b' as k
id代表人编号,m代表月份,k代表其他key键。
id | m | k |
---|---|---|
1 | 201808 | a |
2 | 201808 | a |
1 | 201809 | a |
1 | 201808 | b |
2 | 201809 | b |
需求:本月累计人数(即9月份的客户要包含9月以前的客户数)
预计结果
k m n
a 201808 2
a 201809 2
b 201808 1
b 201809 2
第一步:
SELECT
k,
m,
size(collect_set(id) OVER(PARTITION BY k ORDER BY m asc rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )) as n
from
(
SELECT '1' as id ,'201808' as m,'a' as k
union all
SELECT '2' as id ,'201808' as m,'a' as k
union all
SELECT '1' as id ,'201809' as m,'a' as k
union all
SELECT '1' as id ,'201808' as m,'b' as k
union all
SELECT '2' as id ,'201809' as m,'b' as k
)t
order by k,m
k | m | n |
---|---|---|
a | 201808 | 1 |
a | 201808 | 2 |
a | 201809 | 2 |
b | 201808 | 1 |
b | 201809 | 2 |
第二步:取出分组内最后一条即可
SELECT
k,
m,
n,
row_number() over(PARTITION BY k,m ORDER BY n desc)rk
from
(
SELECT
k,
m,
size(collect_set(id) OVER(PARTITION BY k ORDER BY m asc rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW )) as n
from
(
SELECT '1' as id ,'201808' as m,'a' as k
union all
SELECT '2' as id ,'201808' as m,'a' as k
union all
SELECT '1' as id ,'201809' as m,'a' as k
union all
SELECT '1' as id ,'201808' as m,'b' as k
union all
SELECT '2' as id ,'201809' as m,'b' as k
)t
order by k,m
)having rk = 1
结果为:
k | m | n | rk |
---|---|---|---|
a | 201808 | 2 | 1 |
a | 201809 | 2 | 1 |
b | 201808 | 1 | 1 |
b | 201809 | 2 | 1 |
若需要了解rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,请参考https://mp.csdn.net/postedit/82909903