哈喽,小伙伴们,今天给大家讲解一道面试中常见的SQL问题------用户留存问题,希望能帮助到小伙伴们。需要说明的是本人是在MySQL中进行的开发,如果你使用的是Hive,可能有部分函数不兼容。
1、 需求
求用户1日、3日、7日留存率
概念问题:
第N日活跃用户留存率:以基准日的活跃用户数为主,第N日后依然活跃的用户占基准日活跃用户的比例
第1日留存率(即次日留存):(以基准日当天活跃的用户中,基准日之后的第1天还活跃的用户数)/基准日当天总活跃用户数;
第3日留存率:(以基准日当天活跃的用户中,基准日之后的第3天还活跃的用户数)/基准日当天总活跃用户数;
第7日留存率:(以基准日当天活跃的用户中,基准日之后的第7天还活跃的用户数)/基准日当天总活跃用户数。
2、数据准备
1)建表语句
create table user_log(
userid bigint,
time varchar(100)
)ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='用户表';
2)数据装载
INSERT INTO user_log VALUES
(1,'2019-07-11 01:44:27'),
(1,'2019-07-12 02:44:27'),
(1,'2019-07-14 03:44:27'),
(1,'2019-07-16 04:44:27'),
(1,'2019-07-17 05:44:27'),
(1,'2019-07-18 06:44:27'),
(1,'2019-07-19 07:44:27'),
(1,'2019-07-21 08:44:27'),
(2,'2019-07-11 01:44:27'),
(2,'2019-07-12 02:44:27'),
(2,'2019-07-13 04:44:27'),
(2,'2019-07-14 06:44:27'),
(2,'2019-07-15 07:44:27'),
(2,'2019-07-16 12:44:27'),
(2,'2019-07-17 13:44:27'),
(2,'2019-07-18 23:44:27'),
(3,'2019-07-11 02:44:27'),
(3,'2019-07-13 03:44:27'),
(3,'2019-07-14 06:44:27'),
(3,'2019-07-17 07:44:27'),
(3,'2019-07-19 09:44:27'),
(3,'2019-07-20 10:44:27'),
(3,'2019-07-21 11:44:27'),
(3,'2019-07-22 12:44:27');
3、代码实现
1)进行表的自身关联,过滤出右表日期大于左表日期的数据
SELECT t1.userid userid
,t1.time start_time
,t2.time end_time
,DATEDIFF(t2.time,t1.time) AS diff_date
FROM (
SELECT userid
,DATE_FORMAT(`time`,'%Y-%m-%d') `time`
FROM user_log
GROUP BY userid
,DATE_FORMAT(`time`,'%Y-%m-%d')
) t1
LEFT JOIN (
SELECT userid
,DATE_FORMAT(`time`,'%Y-%m-%d') `time`
FROM user_log
GROUP BY userid
,DATE_FORMAT(`time`,'%Y-%m-%d')
) t2
ON t1.userid = t2.userid
WHERE t1.time <= t2.time;
2)根据行为日期差计算出x日留存用户数,核心就是date_diff函数使用
SELECT start_time
,COUNT(userid) AS base_cnt
,COUNT(DISTINCT IF(diff_date = 1,userid,NULL)) AS remain_1d
,COUNT(DISTINCT IF(diff_date = 3,userid,NULL)) AS remain_3d
,COUNT(DISTINCT IF(diff_date = 7,userid,NULL)) AS remain_7d
FROM (
SELECT t1.userid userid
,t1.time start_time
,t2.time end_time
,DATEDIFF(t2.time,t1.time) AS diff_date
FROM (
SELECT userid
,DATE_FORMAT(`time`,'%Y-%m-%d') `time`
FROM user_log
GROUP BY userid
,DATE_FORMAT(`time`,'%Y-%m-%d')
) t1
LEFT JOIN (
SELECT userid
,DATE_FORMAT(`time`,'%Y-%m-%d') `time`
FROM user_log
GROUP BY userid
,DATE_FORMAT(`time`,'%Y-%m-%d')
) t2
ON t1.userid = t2.userid
WHERE t1.time <= t2.time
) t3
GROUP BY start_time;
3) 计算用户留存率,其中x日留存率 = x日留存用户数/基准日活跃用户数
SELECT start_time
,base_cnt
,remain_1d
,remain_3d
,remain_7d
,round(remain_1d / base_cnt,2) AS remain_1d_rate
,round(remain_3d / base_cnt,2) AS remain_3d_rate
,round(remain_7d / base_cnt,2) AS remain_7d_rate
FROM (
SELECT start_time
,COUNT(userid) AS base_cnt
,COUNT(DISTINCT IF(diff_date = 1,userid,NULL)) AS remain_1d
,COUNT(DISTINCT IF(diff_date = 3,userid,NULL)) AS remain_3d
,COUNT(DISTINCT IF(diff_date = 7,userid,NULL)) AS remain_7d
FROM (
SELECT t1.userid userid
,t1.time start_time
,t2.time end_time
,DATEDIFF(t2.time,t1.time) AS diff_date
FROM (
SELECT userid
,DATE_FORMAT(`time`,'%Y-%m-%d') `time`
FROM user_log
GROUP BY userid
,DATE_FORMAT(`time`,'%Y-%m-%d')
) t1
LEFT JOIN (
SELECT userid
,DATE_FORMAT(`time`,'%Y-%m-%d') `time`
FROM user_log
GROUP BY userid
,DATE_FORMAT(`time`,'%Y-%m-%d')
) t2
ON t1.userid = t2.userid
WHERE t1.time <= t2.time
) t3
GROUP BY start_time
) t4
;