描述
用户行为日志表tb_user_log
id uid artical_id in_time out_time sign_cin
1 101 0 2021-11-01 10:00:00 2021-11-01 10:00:42 1
2 102 9001 2021-11-01 10:00:00 2021-11-01 10:00:09 0
3 103 9001 2021-11-01 10:00:01 2021-11-01 10:01:50 0
4 101 9002 2021-11-02 10:00:09 2021-11-02 10:00:28 0
5 103 9002 2021-11-02 10:00:51 2021-11-02 10:00:59 0
6 104 9001 2021-11-02 10:00:28 2021-11-02 10:00:50 0
7 101 9003 2021-11-03 11:00:55 2021-11-03 11:01:24 0
8 104 9003 2021-11-03 11:00:45 2021-11-03 11:00:55 0
9 105 9003 2021-11-03 11:00:53 2021-11-03 11:00:59 0
10 101 9002 2021-11-04 11:00:55 2021-11-04 11:00:59 0
问题:统计2021年11月每天新用户的次日留存率(保留2位小数)
注:
次日留存率为当天新增的用户数中第二天又活跃了的用户数占比。
如果in_time-进入时间和out_time-离开时间跨天了,在两天里都记为该用户活跃过,结果按日期升序。
建表语句:
DROP TABLE IF EXISTS tb_user_log;
CREATE TABLE tb_user_log (
id INT PRIMARY KEY AUTO_INCREMENT COMMENT '自增ID',
uid INT NOT NULL COMMENT '用户ID',
artical_id INT NOT NULL COMMENT '视频ID',
in_time datetime COMMENT '进入时间',
out_time datetime COMMENT '离开时间',
sign_in TINYINT DEFAULT 0 COMMENT '是否签到'
) CHARACTER SET utf8 COLLATE utf8_bin;
INSERT INTO tb_user_log(uid, artical_id, in_time, out_time, sign_in) VALUES
(101, 0, '2021-11-01 10:00:00', '2021-11-01 10:00:42', 1),
(102, 9001, '2021-11-01 10:00:00', '2021-11-01 10:00:09', 0),
(103, 9001, '2021-11-01 10:00:01', '2021-11-01 10:01:50', 0),
(101, 9002, '2021-11-02 10:00:09', '2021-11-02 10:00:28', 0),
(103, 9002, '2021-11-02 10:00:51', '2021-11-02 10:00:59', 0),
(104, 9001, '2021-11-02 10:00:28', '2021-11-02 10:00:50', 0),
(101, 9003, '2021-11-03 11:00:55', '2021-11-03 11:01:24', 0),
(104, 9003, '2021-11-03 11:00:45', '2021-11-03 11:00:55', 0),
(105, 9003, '2021-11-03 11:00:53', '2021-11-03 11:00:59', 0),
(101, 9002, '2021-11-04 11:00:55', '2021-11-04 11:00:59', 0);
解法:
基本的思路是:用union把in_time和out_time并联起来,对uid和date去重活获得一张用户活跃表。之后找出次活用户和活跃用户,计算留存率即可
细节:没有新用户不用输出,留存率保留2位小数,结果按日期升序,时间范围为2021年11月。
关联in_time和out_time字段,建立用户活跃表
SELECT DISTINCT uid,DATE(in_time) AS dt FROM tb_user_log
UNION--UNION去重,UNION ALL不去重
SELECT DISTINCT uid,DATE(out_time) AS dt FROM tb_user_log
找出新用户和次活新用户,并用‘1’来表示。
WITH t1 AS(
SELECT DISTINCT uid,DATE(in_time) AS dt FROM tb_user_log
UNION
SELECT DISTINCT uid,DATE(out_time) AS dt FROM tb_user_log
)
SELECT dt,
CASE WHEN (uid,dt)IN (SELECT uid,date_add(dt,INTERVAL -1 DAY) FROM t1) AND (uid,dt)IN (SELECT uid,MIN(dt)FROM t1 GROUP BY uid)
THEN 1 ELSE 0 END AS next_user ,#次活新用户
CASE WHEN (uid,dt) IN (SELECT uid,MIN(dt)FROM t1 GROUP BY uid)#新用户用户
THEN 1 ELSE 0 END new_user
FROM t1
WHERE DATE_FORMAT(dt,'%Y%m')='202111';#取2021年11月的数据
计算每日的次活用户数和新用户数,并求新用户次日留存率。
- 每日次活用户数:SUM(next_user )
- 每日新用户数:SUM(new_user)
- 次日留存率,保留2位小数ROUND(SUM(next_user )/SUM(new_user),2)
计算留存率:
ROUND(
SUM(CASE WHEN (uid,dt)IN
(SELECT uid,date_add(dt,INTERVAL -1 DAY) FROM t1)
AND (uid,dt)IN (SELECT uid,MIN(dt) FROM t1 GROUP BY uid)
THEN 1 ELSE 0 END)
/
SUM(CASE WHEN (uid,dt) IN
(SELECT uid,MIN(dt)FROM t1 GROUP BY uid)
THEN 1 ELSE 0 END),2)
AS uv_left_rate
完整代码:
WITH t1 AS(
SELECT DISTINCT uid,DATE(in_time) AS dt FROM tb_user_log
UNION
SELECT DISTINCT uid,DATE(out_time) AS dt FROM tb_user_log
)#用户活跃表
SELECT dt,
ROUND(SUM(CASE WHEN (uid,dt)IN
(SELECT uid,date_add(dt,INTERVAL -1 DAY) FROM t1)
AND (uid,dt)IN (SELECT uid,MIN(dt) FROM t1 GROUP BY uid) THEN 1 ELSE 0 END)
/
SUM(CASE WHEN (uid,dt) IN
(SELECT uid,MIN(dt)FROM t1 GROUP BY uid) THEN 1 ELSE 0 END),2)
AS uv_left_rate
FROM t1
WHERE DATE_FORMAT(dt,'%Y%m')='202111'
GROUP BY dt
HAVING uv_left_rate IS NOT NULL
ORDER BY dt;
解法二:窗口函数,个人感觉这个好理解
照例找出用户活跃表
通过LEAD窗口函数对每个用户的活跃日向上移动一行
SELECT uid,dt,
MIN(dt) OVER (PARTITION BY uid) AS new_dt,
LEAD(dt,1) OVER (PARTITION BY uid ORDER BY dt) AS next_dt
--取下一日向上移动
FROM(
SELECT DISTINCT uid,DATE(in_time) AS dt FROM tb_user_log
UNION
SELECT DISTINCT uid,DATE(out_time) AS dt FROM tb_user_log
)t1
WHERE DATE_FORMAT(dt,'%Y%m')='202111'
如果dt=new_dt那这个用户为新用户,如果dt=new_dt且next_dt和new_dt的日期差为1则这个用户为次留新用户
WITH t2 AS(
SELECT uid,dt,
MIN(dt) OVER (PARTITION BY uid) AS new_dt,
LEAD(dt,1) OVER (PARTITION BY uid ORDER BY dt) AS next_dt
FROM(
SELECT DISTINCT uid,DATE(in_time) AS dt FROM tb_user_log
UNION
SELECT DISTINCT uid,DATE(out_time) AS dt FROM tb_user_log
)t1
WHERE DATE_FORMAT(dt,'%Y%m')='202111'
)
select dt
,ROUND(
sum(CASE WHEN dt = new_dt and DATEDIFF(next_dt,dt) = 1 THEN 1 ELSE 0 END)
/
sum(CASE WHEN dt = new_dt THEN 1 ELSE 0 END)
,2) uv_left_rate
FROM t2
GROUP BY dt
having uv_left_rate IS NOT NULL
ORDER BY dt;
)