题目
取出每月连续 5 天有登录记录的用户。
解题方法
create table member_log
as
with detail as (
select '4/11/2020' as date_d , 'A' as member_code , 21 as sale_amt
union all select '4/12/2020' as date_d , 'A' as member_code , 23 as sale_amt
union all select '4/13/2020' as date_d , 'A' as member_code , 45 as sale_amt
union all select '4/14/2020' as date_d , 'A' as member_code , 43 as sale_amt
union all select '4/15/2020' as date_d , 'A' as member_code , 12 as sale_amt
union all select '4/16/2020' as date_d , 'A' as member_code , 34 as sale_amt
union all select '4/17/2020' as date_d , 'A' as member_code , 65 as sale_amt
union all select '4/18/2020' as date_d , 'A' as member_code , 54 as sale_amt
union all select '4/19/2020' as date_d , 'A' as member_code , 43 as sale_amt
union all select '4/20/2020' as date_d , 'A' as member_code , 23 as sale_amt
union all select '4/21/2020' as date_d , 'A' as member_code , 43 as sale_amt
union all select '4/22/2020' as date_d , 'A' as member_code , 78 as sale_amt
union all select '4/11/2020' as date_d , 'B' as member_code , 21 as sale_amt
union all select '4/12/2020' as date_d , 'B' as member_code , 23 as sale_amt
union all select '4/13/2020' as date_d , 'B' as member_code , 45 as sale_amt
), dim_date as (
select '4/11/2020' as date_d
union all select '4/12/2020' as date_d
union all select '4/13/2020' as date_d
union all select '4/14/2020' as date_d
union all select '4/15/2020' as date_d
union all select '4/16/2020' as date_d
union all select '4/17/2020' as date_d
union all select '4/18/2020' as date_d
union all select '4/19/2020' as date_d
union all select '4/20/2020' as date_d
union all select '4/21/2020' as date_d
union all select '4/22/2020' as date_d
), dim_member as (
select 'A' as member_code
union all select 'B' as member_code
), dim_all as (
select a.date_d
,b.member_code
from dim_date as a
cross join dim_member as b
)
select a.member_code , concat_ws('',collect_list(if(b.date_d is null , '0' , '1'))) as co
from dim_all as a
left join detail as b
on a.date_d = b.date_d
and a.member_code = b.member_code
group by a.member_code
我的思路是使用 collect_list 将行上的数据转到列上。得到表 member_log,它的数据如下:
member_code | 登录情况 |
---|---|
A | 111111111111 |
A | 111000000000 |
最后使用这则表达式,匹配到连续 5 次,1{5,100}
,所以最终的 sql 如下所示:
select member_code ,co
from member_log
where length(regexp_extract(co , '(1{5,100})' , 1)) > 5
还可以优化一下,以月为单位,用 1 表示上线,用 0 表示没有下线。
每当新来一天的数据,就加到 co 字段的后面。
总结
这里用到了行转列和正则表达式。关键式正则表达式使用比较新颖,将正则表达式中的连续性应用到这个题目中。这种做法省去了 row_number 的做法,效率也会搞很多。