指标说明:
次日留存率=基准日新用户在次日的登录总人数/基准日新增用户数
同理,
三日留存率=基准日新用户在第4天的登录总人数/基准日新增用户数
七日留存率=基准日新用户在第8天的登录总人数/基准日新增用户数
计算留存率问题只需要两个表,两个连接条件:注册表registerinfo 简写为r,登录记录表loginrecordinfo简写为l;条件一,两表id一致。条件二,登录表l的登录时间与注册时间的关心。
下举两个例子计算次日,三日留存率问题。
例子1,注册表与登录表给出。
数据准备
DROP TABLE if exists SQL_11_REG;
CREATE TABLE SQL_11_REG(
uid INT AUTO_INCREMENT PRIMARY KEY,
register_time DATETIME NOT NULL
);
insert into SQL_11_REG(register_time) values ('2020-01-01 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-01 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-01 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-01 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-02 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-02 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-02 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-03 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-03 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-03 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-03 00:01:00');
insert into SQL_11_REG(register_time) values ('2020-01-03 00:01:00');
select * from SQL_11_REG;
DROP TABLE if exists SQL_11_LOGIN;
CREATE TABLE SQL_11_LOGIN(
id INT AUTO_INCREMENT PRIMARY KEY,
uid INT NOT NULL,
login_time DATETIME NOT NULL
);
insert into SQL_11_LOGIN(uid, login_time) values (1, '2020-01-02 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (2, '2020-01-02 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (3, '2020-01-02 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (1, '2020-01-03 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (3, '2020-01-03 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (4, '2020-01-03 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (6, '2020-01-03 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (2, '2020-01-04 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (3, '2020-01-04 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (5, '2020-01-04 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (6, '2020-01-04 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (7, '2020-01-04 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (9, '2020-01-04 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (1, '2020-01-05 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (6, '2020-01-05 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (7, '2020-01-05 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (9, '2020-01-05 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (10, '2020-01-05 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (11, '2020-01-05 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (4, '2020-01-06 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (11, '2020-01-06 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (12, '2020-01-06 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (1, '2020-01-07 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (2, '2020-01-07 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (6, '2020-01-07 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (8, '2020-01-07 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (9, '2020-01-07 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (2, '2020-01-08 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (10, '2020-01-08 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (12, '2020-01-08 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (3, '2020-01-09 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (8, '2020-01-09 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (9, '2020-01-09 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (11, '2020-01-09 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (4, '2020-01-10 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (6, '2020-01-10 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (7, '2020-01-10 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (9, '2020-01-10 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (10, '2020-01-10 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (12, '2020-01-10 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (6, '2020-01-11 00:02:00');
insert into SQL_11_LOGIN(uid, login_time) values (12, '2020-01-11 00:02:00');
select * from SQL_11_LOGIN;
注册表如下:
登录表如下:
这里用上了两个关键的连接条件,r.uid=l.uid和l.login_time between r.register_time+interval 1 day and r.register_time+interval 7 day。
date()函数分离并返回时间数据的年月日部分。
这里的左右与left(l.login_time,10) as lt一致。
datediff()函数只计算时间数据的年月日部分的差值
第一步,得到uid,rt,lt,tdiff格式的表
with t1 as(
select r.uid,
date(r.register_time) as rt,
left(l.login_time,10) as lt,
datediff(l.login_time,r.register_time) as tdiff
from sql_11_reg as r
left join sql_11_login as l
on r.uid=l.uid
and date(l.login_time) between date(r.register_time)+interval 1 day and date(r.register_time)+interval 7 day
)
select * from t1
结果如下:
第二步,根据注册时间分组并利用聚合函数count和casewhen创建新列
with rl as(
select r.uid,
date(r.register_time) as rt,
left(l.login_time,10) as lt,
datediff(l.login_time,r.register_time) as tdiff
from sql_11_reg as r
left join sql_11_login as l
on r.uid=l.uid
and date(l.login_time) between date(r.register_time)+interval 1 day and date(r.register_time)+interval 7 day
)
select rt,
#一般来说注册表应该每个用户都只有一条记录,但是以防万一需要用到distinct
count(distinct uid) as 新增用户 ,
#else null可缺省
count(distinct case when tdiff=1 then uid else null end)/count(distinct uid) as 次日留存率,
count(distinct case when tdiff=3 then uid else null end)/count(distinct uid) as 三日留存率,
count(distinct case when tdiff=7 then uid else null end)/count(distinct uid) as 七日留存率
from rl
group by rt;
结果如下:
例子2,只有登录表给出,那么默认第一次登录为注册时间
数据准备
DROP TABLE if exists SQL_12;
CREATE TABLE SQL_12(
user_id INT ,
login_time DATETIME NOT NULL
);
insert into SQL_12(user_id,login_time) values (1001,'2021-01-01 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-01 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-01 00:01:00');
insert into SQL_12(user_id,login_time) values (1001,'2021-01-02 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-02 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-03 00:01:00');
insert into SQL_12(user_id,login_time) values (1004,'2021-01-03 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-03 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1004,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1007,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1008,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1001,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1004,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1001,'2021-01-07 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-07 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-07 00:01:00');
insert into SQL_12(user_id,login_time) values (1007,'2021-01-07 00:01:00');
select * from SQL_12;
先通过group by user_id取最小的登录日期作为注册日期,来构造注册表r
with r as(
select user_id as uid,
min(date(login_time)) as rt
from sql_12
group by user_id
)
select * from r
继续构造登录表l
with r as(
select user_id as uid,
min(date(login_time)) as rt
from sql_12
group by user_id
),
l as(
select user_id as uid,
date(login_time) as lt
from sql_12
)
select *
from l;
然后就可以像例子一一样解决问题。
DROP TABLE if exists SQL_12;
CREATE TABLE SQL_12(
user_id INT ,
login_time DATETIME NOT NULL
);
insert into SQL_12(user_id,login_time) values (1001,'2021-01-01 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-01 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-01 00:01:00');
insert into SQL_12(user_id,login_time) values (1001,'2021-01-02 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-02 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-03 00:01:00');
insert into SQL_12(user_id,login_time) values (1004,'2021-01-03 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-03 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1004,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1007,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1008,'2021-01-04 00:01:00');
insert into SQL_12(user_id,login_time) values (1001,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1004,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-05 00:01:00');
insert into SQL_12(user_id,login_time) values (1002,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1005,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-06 00:01:00');
insert into SQL_12(user_id,login_time) values (1001,'2021-01-07 00:01:00');
insert into SQL_12(user_id,login_time) values (1003,'2021-01-07 00:01:00');
insert into SQL_12(user_id,login_time) values (1006,'2021-01-07 00:01:00');
insert into SQL_12(user_id,login_time) values (1007,'2021-01-07 00:01:00');
select * from SQL_12;
# 先构造注册表r
with r as(
select user_id as uid,
min(date(login_time)) as rt
from sql_12
group by user_id
),
l as(
select user_id as uid,
date(login_time) as lt
from sql_12
),
rl as(
select r.uid,
r.rt,
l.lt,
datediff(l.lt,r.rt) as difft
from r left join l
on r.uid=l.uid
and
l.lt between r.rt+interval 1 day and r.rt+interval 7 day
)
select
count(distinct uid) 新增用户数,
count(distinct case when difft=1 then uid else null end)/count(distinct uid) as 次日留存率,
count(distinct case when difft=3 then uid else null end)/count(distinct uid) as 三日留存率,
count(distinct case when difft=7 then uid else null end)/count(distinct uid) as 次日留存率
from rl
group by rt
order by rt;
结果如下: