用户留存
- 用户登陆表
CREATE TABLE IF NOT EXISTS login_log(
user_id INT ,
log_time TIMESTAMP
) engine=MergeTree() order by user_id;
- 数据
INSERT INTO login_log VALUES
(1101,'2021-01-21 22:00:00'),
(1101,'2021-01-20 22:00:00'),
(1101,'2021-01-19 22:00:00'),
(1101,'2021-01-17 22:00:00'),
(1101,'2021-01-16 22:00:00'),
(1101,'2021-01-21 22:00:00'),
(1101,'2021-01-21 23:00:00'),
(1101,'2021-01-20 23:00:00'),
(1101,'2021-01-19 23:00:00'),
(1101,'2021-01-17 23:00:00'),
(1101,'2021-01-16 23:00:00'),
(1101,'2021-01-21 23:00:00'),
(4101,'2021-01-20 22:00:00'),
(4101,'2021-01-19 22:00:00'),
(4101,'2021-01-17 22:00:00'),
(4101,'2021-01-16 22:00:00'),
(2201,'2021-01-16 14:00:00'),
(2201,'2021-01-15 23:04:00'),
(2201,'2021-01-21 18:00:00'),
(2201,'2021-01-20 21:00:00'),
(2201,'2021-01-21 23:00:00'),
(2201,'2021-01-20 23:00:00'),
(3301,'2021-01-21 22:00:00'),
(3301,'2021-01-19 22:00:00'),
(3301,'2021-01-18 23:00:00'),
(3301,'2021-01-17 23:00:00'),
(3301,'2021-01-16 23:00:00'),
(3301,'2021-01-15 23:00:00');
- 如需查询用户在2021-01-17至2021-01-21登录流失情况,则SQL语句如下:
SELECT
user_id,
retention(
date(log_time) = '2021-01-21',
date(log_time) = '2021-01-20',
date(log_time) = '2021-01-19',
date(log_time) = '2021-01-18',
date(log_time) = '2021-01-17'
) AS r
FROM login_log
GROUP BY user_id
ORDER BY user_id ASC;
user_id|r |
-------|-----------|
1101|[1,1,1,0,1]|
2201|[1,1,0,0,0]|
3301|[1,0,1,1,1]|
4101|[0,0,0,0,0]|
- 查询留存
SELECT
DATE(TIMESTAMP '2021-01-15 00:00:00') AS first_date,
SUM(r[1]) AS "第一天活跃用户",
SUM(r[2])/ SUM(r[1]) AS "次日留存",
SUM(r[3])/ SUM(r[1]) AS "3日留存",
SUM(r[4])/ SUM(r[1]) AS "7日留存"
FROM
-- 计算2021-01-15活跃用户在第2、3、7日的登录情况,1/0 => 登录/未登录
(
WITH first_day_table AS (
SELECT
TIMESTAMP '2021-01-15 00:00:00' AS first_day
)
SELECT
user_id,
retention(
DATE(log_time) = (SELECT DATE(first_day) FROM first_day_table),
DATE(log_time) = (SELECT DATE(first_day + INTERVAL '1 day') FROM first_day_table),
DATE(log_time) = (SELECT DATE(first_day + INTERVAL '2 day') FROM first_day_table),
DATE(log_time) = (SELECT DATE(first_day + INTERVAL '6 day') FROM first_day_table)
) AS r
-- 过滤2021-01-15活跃用户在后续 1~7 日登录数据
FROM
login_log
WHERE
(
log_time >= TIMESTAMP '2021-01-15 00:00:00'
)
AND (
log_time <= TIMESTAMP '2021-01-15 00:00:00' + INTERVAL '6 day'
)
GROUP BY
user_id
) AS basic_table
GROUP BY
first_date;
first_date|第一天活跃用户|次日留存|3日留存|7日留存|
----------|-------|----|----|----|
2021-01-15| 2| 1.0| 0.5| 0.0|
- 函数分析
语法:retention(cond1, cond2, ..., cond32);
// 示例
r1: date=2020-01-01( cond1 条件)。
r2: 2020-01-01=<date<=2020-01-02 (cond1 和 cond2 条件)。
r3: 2020-01-01=<date<=2020-01-03 (cond1 和 cond3 条件)。
retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') as r
行为漏斗转化
- 用户行为表
CREATE TABLE IF NOT EXISTS user_action
(
`uid` Int32,
`event_type` String,
`time` datetime
)
ENGINE = MergeTree()
ORDER BY uid;
- 数据
insert into user_action values
(1,'浏览','2021-01-02 11:00:00'),
(1,'点击','2021-01-02 11:10:00'),
(1,'下单','2021-01-02 11:20:00'),
(1,'支付','2021-01-02 11:30:00'),
(2,'下单','2021-01-02 11:00:00'),
(2,'支付','2021-01-02 11:10:00'),
(1,'浏览','2021-01-02 11:00:00'),
(3,'浏览','2021-01-02 11:20:00'),
(3,'点击','2021-01-02 12:00:00'),
(4,'浏览','2021-01-02 11:50:00'),
(4,'点击','2021-01-02 12:00:00'),
(5,'浏览','2021-01-02 11:50:00'),
(5,'点击','2021-01-02 12:00:00'),
(5,'下单','2021-01-02 11:10:00'),
(6,'浏览','2021-01-02 11:50:00'),
(6,'点击','2021-01-02 12:00:00'),
(6,'下单','2021-01-02 12:10:00');
- 30分钟作为一个时间窗口,查看窗口数据
SELECT
user_id,
windowFunnel(1800)(time, event_type = '浏览', event_type = '点击', event_type = '下单', event_type = '支付') AS level
FROM
(
SELECT
time,
event_type,
uid AS user_id
FROM action
)
GROUP BY user_id
user_id|level|
-------|-----|
4| 2|
3| 1|
2| 0|
5| 2|
1| 4|
6| 3|
- 分析"2021-01-02"这天 路径为“浏览->点击->下单->支付”的转化情况
SELECT level_index,count(1) FROM
(
SELECT user_id,
arrayWithConstant(level, 1) levels,
arrayJoin(arrayEnumerate( levels )) level_index
FROM (
SELECT
user_id,
windowFunnel(1800)(
time,
event_type = '浏览',
event_type = '点击' ,
event_type = '下单',
event_type = '支付'
) AS level
FROM (
SELECT time, event_type , uid as user_id
FROM user_action
WHERE toDate(time) = '2020-01-02'
)
GROUP BY user_id
)
)
group by level_index
ORDER BY level_index
// 内部数据
user_id|levels |level_index|
-------|---------|-----------|
4|[1,1] | 1|
4|[1,1] | 2|
3|1 | 1|
5|[1,1] | 1|
5|[1,1] | 2|
1|[1,1,1,1]| 1|
1|[1,1,1,1]| 2|
1|[1,1,1,1]| 3|
1|[1,1,1,1]| 4|
6|[1,1,1] | 1|
6|[1,1,1] | 2|
6|[1,1,1] | 3|
// 结果
level_index|count(1)|
-----------|--------|
1| 5|
2| 4|
3| 2|
4| 1|
路径分析
- 关键路径分析:已经明确了要分析的路径,需要看下这些访问路径上的用户数据
- 智能路径分析:不确定有哪些路径,但是清楚目标路径是什么,需要知道用户在指定时间范围内都是通过哪些途径触达目标路径的
下单超过10分钟才支付
--明确了要分析的路径
SELECT
count(1) AS "userCount",
sum(cn) AS "actionCount"
FROM
(
SELECT
uid,
sequenceCount('(?1)(?t>=600)(?2)')(toDateTime(time), event_type = '下单', event_type = '支付') AS cn
FROM user_action
GROUP BY uid
)
WHERE cn >= 1
--智能路径分析