需求描述
将同一用户的多段连续日期范围,进行合并。
id | 开始时间 | 结束时间 |
---|---|---|
1 | 2020-01-01 | 2020-01-07 |
1 | 2020-01-08 | 2020-01-20 |
1 | 2020-01-21 | 2020-01-30 |
要合并成
id | 开始时间 | 结束时间 |
---|---|---|
1 | 2020-01-01 | 2020-01-20 |
1 | 2020-01-21 | 2020-01-30 |
解决思路
- 根据id分组开窗,根据开始时间、结束时间排序,取每行的上一条记录的结束时间。
- 如果当前行的开始时间小于上条记录的结束时间,则当前行与上一行所属在相同的合并周期内
- 根据id和合并周期分组,取最大、最小时间。就得出每个合并周期内的开始结束时间。
-- 1. 根据id分组开窗,根据开始时间、结束时间排序,取每行的上一条记录的结束时间。
SELECT startdate,
enddate,
granularity1,
lag(enddate)OVER (PARTITION BY granularity1
ORDER BY startdate, enddate) AS lagenddate
FROM VALUES
(1, '2020-01-01', '2020-04-31', 'Value1', 'ValueA'),
(1, '2020-04-01', '2020-06-30', 'Value1', 'ValueB'),
(1, '2020-07-01', '2020-09-30', 'Value1', 'ValueC'),
(1, '2020-10-01', '2020-10-30', 'Value1', 'ValueD')
AS t(id, startdate, enddate, granularity1, granularity2)
-- 2. 如果当前行的开始时间小于上条记录的结束时间,则当前行与上一行所属在相同的合并周期内
SELECT startdate,
enddate,
lagenddate,
granularity1,
sum(CASE
WHEN lagenddate >= startdate THEN 0
ELSE 1
END)OVER (PARTITION BY granularity1
ORDER BY startdate) AS groupid
FROM (
SELECT startdate,
enddate,
granularity1,
lag(enddate)OVER (PARTITION BY granularity1
ORDER BY startdate) AS lagenddate
FROM VALUES
(1, '2020-01-01', '2020-04-31', 'Value1', 'ValueA'),
(1, '2020-04-01', '2020-06-30', 'Value1', 'ValueB'),
(1, '2020-07-01', '2020-09-30', 'Value1', 'ValueC'),
(1, '2020-10-01', '2020-10-30', 'Value1', 'ValueD')
AS t(id, startdate, enddate, granularity1, granularity2)
) AS t
--3. 根据id和合并周期分组,取最大、最小时间。就得出每个合并周期内的开始结束时间。
SELECT min(startdate) AS startdate,
max(enddate) AS enddate,
granularity1,
groupid
FROM (
SELECT startdate,
enddate,
lagenddate,
granularity1,
sum(CASE
WHEN lagenddate >= startdate THEN 0
ELSE 1
END)OVER (PARTITION BY granularity1
ORDER BY startdate) AS groupid
FROM (
SELECT startdate,
enddate,
granularity1,
lag(enddate)OVER (PARTITION BY granularity1
ORDER BY startdate) AS lagenddate
FROM VALUES
(1, '2020-01-01', '2020-04-31', 'Value1', 'ValueA'),
(1, '2020-04-01', '2020-06-30', 'Value1', 'ValueB'),
(1, '2020-07-01', '2020-09-30', 'Value1', 'ValueC'),
(1, '2020-10-01', '2020-10-30', 'Value1', 'ValueD')
AS t(id, startdate, enddate, granularity1, granularity2)
) AS t
) AS t
GROUP BY granularity1, groupid
;