目标:求每个用户,连续访问最多的天数
('a', '2023-01-07', 67)
('a', '2023-01-05', 82)
('a', '2023-01-08', 76)
('b', '2023-01-07', 53)
('b', '2023-01-08', 91)
('b', '2023-01-09', 87)
('c', '2023-01-27', 73)
('c', '2023-01-27', 61)
('c', '2023-01-25', 97)
('c', '2023-01-26', 58)
分析:
1)《求每个用户,连续访问最多的天数》,所以按照用户进行分区 partition by (name)
2) 某个用户,一天内可能有多次访问记录,例如(‘c’, ‘2023-01-27’),所以分区内需要去重,选择去重引擎ReplacingMergeTree,去重时依据的是 order by 字段,所以 ORDER BY (name, cDate)
-- 1、分区
-- 2、分区内去重
-- 创建去重表
drop table tb_web_visit;
CREATE TABLE tb_web_visit
(
`name` String,
`cDate` Date,
`cnt` String
) ENGINE = ReplacingMergeTree
partition by (name)
ORDER BY (name, cDate);
-- 向表中插入数据
INSERT INTO table tb_web_visit
VALUES
('a', '2023-01-07', 67),
('a', '2023-01-05', 82),
('a', '2023-01-08', 76),
('b', '2023-01-07', 53),
('b', '2023-01-08', 91),
('b', '2023-01-09', 87),
('c', '2023-01-27', 73),
('c', '2023-01-27', 61),
('c', '2023-01-25', 97),
('c', '2023-01-26', 58);
optimize table tb_web_visit final;
select *
from tb_web_visit;
1、构造炸裂条件
select name,
groupArray(cDate) as collect_list_dt,
groupArray(cnt) as collect_list_cnt
from tb_web_visit
group by name;
2、炸裂后构造序号
select name,
dt,
cnt,
idx
from (
select name,
groupArray(cDate) as collect_list_dt,
groupArray(cnt) as collect_list_cnt
from tb_web_visit
group by name
) t1
ARRAY JOIN
collect_list_dt as dt,
collect_list_cnt as cnt,
arrayEnumerate(collect_list_dt) as idx;
3、构造分组条件
select name,
dt,
dateSub(dt, interval idx day) as grp, -- 构造分组
cnt,
idx
from (
select name,
groupArray(cDate) as collect_list_dt,
groupArray(cnt) as collect_list_cnt
from tb_web_visit
group by name
) t1
ARRAY JOIN
collect_list_dt as dt,
collect_list_cnt as cnt,
arrayEnumerate(collect_list_dt) as idx;
4、分组内求每个用户的连续访问次数
-- 分组内求每个用户的连续访问次数
select name,
count(1) as cnt
from (
select name,
dt,
dateSub(dt, interval idx day) as grp,
cnt,
idx
from (
select name,
groupArray(cDate) as collect_list_dt,
groupArray(cnt) as collect_list_cnt
from tb_web_visit
group by name
) t1
ARRAY JOIN
collect_list_dt as dt,
collect_list_cnt as cnt,
arrayEnumerate(collect_list_dt) as idx
) t2
group by name, grp;
5、每个用户,求最大
select name,
max(cnt)
from (
select name,
count(1) as cnt
from (
select name,
dt,
dateSub(dt, interval idx day) as grp,
cnt,
idx
from (
select name,
groupArray(cDate) as collect_list_dt,
groupArray(cnt) as collect_list_cnt
from tb_web_visit
group by name
) t1
ARRAY JOIN
collect_list_dt as dt,
collect_list_cnt as cnt,
arrayEnumerate(collect_list_dt) as idx
) t2
group by name, grp
) t3
group by name;