题目
用户不超过2天即视为连续登录,求用户最大连续登录天数,如用户A分别在1、3、5、6号进行登录,则用户连续登录天数为6天。
表和数据
DROP TABLE IF EXISTS test_login_events;
CREATE EXTERNAL TABLE IF NOT EXISTS test_login_events
(
user_id INT COMMENT '用户id',
login_datetime STRING COMMENT '登录时间'
) COMMENT '直播间访问记录' STORED AS ORC TBLPROPERTIES ( "orc.compress" = "SNAPPY", "discover.partitions" = "false" );
INSERT OVERWRITE TABLE test_login_events VALUES
(100, '2023-12-01 19:00:00'),
(100, '2023-12-01 19:30:00'),
(100, '2023-12-02 21:01:00'),
(100, '2023-12-03 11:01:00'),
(101, '2023-12-01 19:05:00'),
(101, '2023-12-01 21:05:00'),
(101, '2023-12-03 21:05:00'),
(101, '2023-12-05 15:05:00'),
(101, '2023-12-06 19:05:00'),
(102, '2023-12-01 19:55:00'),
(102, '2023-12-01 21:05:00'),
(102, '2023-12-02 21:57:00'),
(102, '2023-12-03 19:10:00'),
(104, '2023-12-04 21:57:00'),
(104, '2023-12-02 22:57:00'),
(105, '2023-12-01 10:01:00');
解题
- 先对数据进行进行去重处理,每天只取第一次登录的数据
- 计算时间间隔,大于2的标记为1,否则为0
- 再通过flag计算用户登录的session_id
- 计算用户每个session_id的用户连续登录天数(最大最小日期的差值)
WITH temp1 AS (
SELECT
user_id,
login_datetime,
date_format(login_datetime, 'yyyy-MM-dd') AS login_date
FROM
test_login_events
), temp2 AS (
SELECT
user_id,
login_datetime,
login_date,
row_number() over (partition by user_id, login_date order by login_datetime) AS rn
FROM
temp1
), temp3 AS (
SELECT
user_id,
login_datetime,
login_date
FROM
temp2
WHERE
rn = 1
), temp4 AS (
SELECT
user_id,
login_datetime,
login_date,
lag(login_date) over (partition by user_id order by login_date) as last_login_date,
datediff(login_date, lag(login_date) over (partition by user_id order by login_date)) as login_diff
FROM
temp3
), temp5 AS (
SELECT
user_id,
login_datetime,
login_date,
last_login_date,
login_diff,
if(login_diff > 2, 1, 0) as login_session_flag
FROM
temp4
), temp6 AS (
SELECT
user_id,
login_datetime,
login_date,
last_login_date,
login_diff,
login_session_flag,
concat(user_id, '_', sum(login_session_flag) over (partition by user_id order by login_datetime)) as session_id
FROM
temp5
), temp7 AS (
SELECT
user_id,
session_id,
datediff(max(login_date), min(login_date)) + 1 as max_days
FROM
temp6 a
GROUP BY
user_id,
session_id
)
SELECT
user_id,
max(max_days) as max_days
FROM
temp7 a
GROUP BY
user_id
;
temp6数据
+------------+----------------------+---------------+--------------------+---------------+-----------------------+---------------+
| a.user_id | a.login_datetime | a.login_date | a.last_login_date | a.login_diff | a.login_session_flag | a.session_id |
+------------+----------------------+---------------+--------------------+---------------+-----------------------+---------------+
| 100 | 2023-12-01 19:00:00 | 2023-12-01 | NULL | NULL | 0 | 100_0 |
| 100 | 2023-12-02 21:01:00 | 2023-12-02 | 2023-12-01 | 1 | 0 | 100_0 |
| 100 | 2023-12-03 11:01:00 | 2023-12-03 | 2023-12-02 | 1 | 0 | 100_0 |
| 101 | 2023-12-01 19:05:00 | 2023-12-01 | NULL | NULL | 0 | 101_0 |
| 101 | 2023-12-03 21:05:00 | 2023-12-03 | 2023-12-01 | 2 | 0 | 101_0 |
| 101 | 2023-12-05 15:05:00 | 2023-12-05 | 2023-12-03 | 2 | 0 | 101_0 |
| 101 | 2023-12-06 19:05:00 | 2023-12-06 | 2023-12-05 | 1 | 0 | 101_0 |
| 102 | 2023-12-01 19:55:00 | 2023-12-01 | NULL | NULL | 0 | 102_0 |
| 102 | 2023-12-02 21:57:00 | 2023-12-02 | 2023-12-01 | 1 | 0 | 102_0 |
| 102 | 2023-12-03 19:10:00 | 2023-12-03 | 2023-12-02 | 1 | 0 | 102_0 |
| 104 | 2023-12-02 22:57:00 | 2023-12-02 | NULL | NULL | 0 | 104_0 |
| 104 | 2023-12-04 21:57:00 | 2023-12-04 | 2023-12-02 | 2 | 0 | 104_0 |
| 105 | 2023-12-01 10:01:00 | 2023-12-01 | NULL | NULL | 0 | 105_0 |
+------------+----------------------+---------------+--------------------+---------------+-----------------------+---------------+
temp7数据
+----------+-------------+-----------+
| user_id | session_id | max_days |
+----------+-------------+-----------+
| 100 | 100_0 | 2 |
| 101 | 101_0 | 5 |
| 105 | 105_0 | 0 |
| 102 | 102_0 | 2 |
| 104 | 104_0 | 2 |
+----------+-------------+-----------+
最总结果
+----------+-----------+
| user_id | max_days |
+----------+-----------+
| 100 | 3 |
| 102 | 3 |
| 104 | 3 |
| 101 | 6 |
| 105 | 1 |
+----------+-----------+