用户活跃模型表. 表名:tmp_dau_based
字段解释:
- imp_date 日期 int格式.
- qimei 用户唯一标识(相当于用户id) int格式 无空值.
- is_new 新用户标识 int格式. (1标识新用户 0标识老用户)
备注:一个用户1天只出现1次,出现即表示当日登陆。
红包参与模型表. 表名:tmp_packet_based
字段解释:
- imp_date 日期 int格式.
- qimei 用户唯一标识(相当于用户id) int格式 无空值.
- report_time 领取时间戳 int格式.
- add_money 领取金额 int格式.(单位为分 无空值或0值)
备注:日志流水表,每一行为领取1次红包。无特殊说明,一般不考虑领取红包但当日未登录的情况。
(为什么会出现“已领取红包,但未登录的情况”,因为在现实的数据中,数据上报无法做到十分准确,这种其实是BUG数据。需要开发完善上报。)
-- 36、计算2019年6月1日至6月20日每日DAU
SELECT imp_date
,COUNT(DISTINCT qimei) as dau
FROM tmp_dau_based
WHERE imp_date BETWEEN '20190601'and '20190620'
GROUP BY imp_date;
-- 37、按新老用户区分,计算2019年3月1日至3月10日,每天领取红包的用户数,人均领取金额,人均领取次数
SELECT
t1.imp_date
,t2.is_new
,COUNT( DISTINCT t1.qimei ) AS user_cnt
,AVG(t1.packet_amount ) AS avg_packet_amount
,AVG(t1.packet_times) AS avg_packet_times
FROM
( -- 计算每位用户每天的领取金额及领取次数
SELECT imp_date
,qimei
,SUM( add_money ) AS packet_amount
,COUNT(1) AS packet_times
FROM tmp_packet_based
WHERE imp_date BETWEEN '20190301' AND '20190310'
GROUP BY imp_date
,qimei
) t1
LEFT JOIN
(
SELECT imp_date
,qimei
,is_new
FROM tmp_dau_based
WHERE imp_date BETWEEN '20190301' AND '20190310'
) t2
ON t1.imp_date = t2.imp_date AND t1.qimei = t2.qimei
GROUP BY t1.imp_date
,t2.is_new
ORDER BY t1.imp_date
,t2.is_new;
# 38. 计算2019年3月至6月,每个月领取过红包用户和未领取过红包用户的数量、平均月活跃天数(既本月平均活跃多少天)
-- 恭喜您,答对了 !!!
/*
-- 逻辑梳理:
维度:每个月(时间标签)、是否领取红包(属性标签)
指标:活跃用户数、平均活跃天数( 1. 先计算每个自然月、每个用户的活跃天数; 2. 只保留自然月维度,计算平均天数)
-- 解题思路:
1.按天看人均领取金额和人均领取次数,所以要算出每天每人的领取金额和领取次数,然后再做平均。
2.还需按每天的新老用户进行区分,所以在a表里面取出每个用户是否是新老用户的字段,然后a表和b表关联,最后再聚合。
*/
SELECT imp_month
, is_received
, COUNT(DISTINCT qimei) as user_cnt
, AVG(per_user_active_days) as avg_active_days
FROM
(
-- 每月每位活跃用户是否领取红包、以及活跃天数标签
SELECT t1.imp_month
, t1.qimei
, t1.per_user_active_days
, IF(t2.imp_month IS NULL , 0, 1) AS is_received
FROM
( -- 计算活跃用户每月每位用户的活跃天数
SELECT DATE_FORMAT( imp_date,'%Y-%m') as imp_month
,qimei
,COUNT(DISTINCT imp_date) AS per_user_active_days
FROM tmp_dau_based
WHERE imp_date >= '20190301' AND imp_date < '20190701'
GROUP BY DATE_FORMAT( imp_date,'%Y-%m')
,qimei
) t1
LEFT JOIN
(
-- 红包领取日志表中的按每月、每位用户需进行去重操作
SELECT DATE_FORMAT( imp_date,'%Y-%m') as imp_month
,qimei
FROM tmp_packet_based
WHERE imp_date >= '20190301' AND imp_date < '20190701'
GROUP BY DATE_FORMAT( imp_date,'%Y-%m')
,qimei
) t2
ON t1.imp_month = t2.imp_month AND t1.qimei = t2.qimei
) t
GROUP BY imp_month
,is_received
ORDER BY imp_month
,is_received;
-- 39.计算2019年3月6日所有老用户领取的第一个红包金额,若用户没有领取红包则金额为0
/*
所有老用户: is_new = 0
领取的第一个红包金额: ①第一个红包 ②金额
若用户没有领取红包则金额为0
*/
SELECT t1.imp_date
,t1.qimei
,IFNULL(t2.add_money,0) AS packet_amount
FROM
(
-- 2019年3月6日所有老用户
SELECT qimei, imp_date
FROM tmp_dau_based
WHERE imp_date = '20190306'
AND is_new = 0
) t1
LEFT JOIN
(
-- 领取的第一个红包金额
SELECT qimei , add_money
FROM
(
SELECT qimei
, add_money
, row_number()over( PARTITION BY qimei ORDER BY report_time asc ) as rn
FROM tmp_packet_based
WHERE imp_date = '20190306'
) t
WHERE rn = 1
) t2
ON t1.qimei = t2.qimei ;
-- 40.计算2019年6月1日至6月10日,每日领取红包的新用户数、老用户数及其人均领取金额,人均领取次数,要考虑【领取红包但当日未登录】的情况
SELECT
t1.imp_date
-- 2 代表领取红包但当日未登录
,ifnull(t2.is_new,2) AS is_new
,COUNT( DISTINCT t1.qimei ) AS user_cnt
,AVG(t1.packet_amount ) AS avg_packet_amount
,AVG(t1.packet_times) AS avg_packet_times
FROM
( -- 计算每位用户每天的领取金额及领取次数
SELECT imp_date
,qimei
,SUM( add_money ) AS packet_amount
,COUNT(1) AS packet_times
FROM tmp_packet_based
WHERE imp_date BETWEEN '20190601' AND '20190610'
GROUP BY imp_date
,qimei
) t1
LEFT JOIN
(
SELECT imp_date
,qimei
,is_new
FROM tmp_dau_based
WHERE imp_date BETWEEN '20190601' AND '20190610'
) t2
ON t1.imp_date = t2.imp_date AND t1.qimei = t2.qimei
GROUP BY t1.imp_date
,t2.is_new
ORDER BY t1.imp_date
,t2.is_new;
-- 41.计算2019年5月1日,每个新用户领取的第一个红包和第二个红包的时间差(只计算注册当日有领取红包的用户,注册当日以DAU表中新用户为1的用户)
-- UNIX时间戳转换为日期用函数: FROM_UNIXTIME()
/*
-- 每个新用户领取情况
注意:每个表的详细级别
难点:怎么筛出来每位用户,第一个红包,第二个红包的数据
*/
-- 解法1:简洁,通过
SELECT t1.imp_date
,t1.qimei
,min(report_time) as first_action_time
,max(report_time) as second_action_time
,TIMESTAMPDIFF(second , min(report_time), max(report_time)) AS time_interval
FROM
(
SELECT imp_date
,qimei
FROM tmp_dau_based
WHERE imp_date = '20190501' AND is_new = 1
) t1
INNER JOIN
(
SELECT qimei, report_time
FROM
(
SELECT imp_date
,qimei
,FROM_UNIXTIME(report_time) report_time
,row_number()over(PARTITION BY qimei ORDER BY report_time ASC) AS rn
FROM tmp_packet_based
WHERE imp_date = '20190501'
) t
WHERE rn < 3
) t2
ON t1.qimei = t2.qimei
GROUP BY t1.imp_date
,t1.qimei;
-- 解法2:略显冗余, 不通过(实际是对的)
SELECT *
FROM
(
SELECT t1.imp_date
,t1.qimei
,report_time rn1
,lead(report_time,1)over(PARTITION BY t1.qimei ORDER BY report_time ASC) rn2
,TIMESTAMPDIFF(second ,report_time,lead(report_time,1)over(PARTITION BY t1.qimei ORDER BY report_time ASC) ) AS time_interval
FROM
(
SELECT imp_date
,qimei
FROM tmp_dau_based
WHERE imp_date = '20190501' AND is_new = 1
) t1
INNER JOIN
(
SELECT qimei, report_time
FROM
(
SELECT imp_date
,qimei
,FROM_UNIXTIME(report_time) report_time
,row_number()over(PARTITION BY qimei ORDER BY report_time ASC) rn
FROM tmp_packet_based
WHERE imp_date = '20190501'
) t
WHERE rn < 3
) t2
ON t1.qimei = t2.qimei
) t
WHERE t. rn2 is not null;
-- 42. 计算2019年6月1日至6月20日,每日的用户次日留存率、领取红包用户的次日留存率、未领取红包用户的次日留存率
# 难点:对于未领取、领取红包用户如何筛选计算,以及如何减少不必要的sql代码冗余
SELECT t1.imp_date
,COUNT(DISTINCT t2.qimei) / COUNT(DISTINCT t1.qimei) AS retention
,COUNT(DISTINCT CASE WHEN is_packet_user = 1 THEN t2.qimei END) / COUNT(DISTINCT CASE WHEN is_packet_user = 1 THEN t1.qimei END) AS packet_retention
,COUNT(DISTINCT CASE WHEN is_packet_user = 0 THEN t2.qimei END) / COUNT(DISTINCT CASE WHEN is_packet_user = 0 THEN t1.qimei END) AS non_packet_retention
FROM
(
-- 对每位活跃用户在每天是否领取红包,打上标签,0 表示未领取,1表示领取
SELECT a.imp_date
,a.qimei
,IF(b.qimei IS NULL, 0, 1) AS is_packet_user
FROM( -- 建议优化:缩小数据范围
SELECT DISTINCT imp_date,qimei
FROM tmp_dau_based
WHERE imp_date BETWEEN '20190601' AND '20190620'
) a
LEFT JOIN (
-- 对红包参与模型表进行每用户按日去重
SELECT imp_date
,qimei
FROM tmp_packet_based
WHERE imp_date BETWEEN '20190601' AND '20190620'
GROUP BY imp_date
,qimei
) b ON a.imp_date = b.imp_date AND a.qimei = b.qimei
) t1
LEFT JOIN
(
SELECT imp_date
,qimei
FROM tmp_dau_based
WHERE imp_date BETWEEN '20190602' AND '20190621'
) t2
ON t1.imp_date = (t2.imp_date - 1) AND t1.qimei = t2.qimei
GROUP BY t1.imp_date;
-- dathon参考答案:相对更简洁
with a as (
select distinct imp_date,qimei
from tmp_dau_based
where imp_date between '20190601' and '20190621'
),b as ( select distinct imp_date, qimei
from tmp_packet_based
where imp_date between '20190601' and '20190620'
)
select
t1.imp_date,
count( distinct t2.qimei) / count(distinct t1.qimei) retention,
count( distinct if(t1.is_packet_user = 1, t2.qimei, null)) / count(distinct if(t1.is_packet_user = 1, t1.qimei, null)) packet_retention,
count( distinct if(t1.is_packet_user = 0, t2.qimei, null)) / count( distinct if(t1.is_packet_user = 0, t1.qimei, null)) non_packet_retention
from
( -- t1 表找出每日的登录用户并标记用户是否领取红包,作为留存率的分母
select a.imp_date,
a.qimei,
if(b.qimei is null, 0, 1) is_packet_user
from a
left join b
on a.imp_date = b.imp_date and a.qimei = b.qimei
where a.imp_date between '20190601' and '20190620'
group by a.imp_date,
a.qimei
) t1
-- 当日登录的用户有多少在次日仍然登录,需要对登录表进行连接操作,即t1 left join t2 ( t2 即临时表a )
left join a t2
on t1.qimei = t2.qimei and datediff(t2.imp_date, t1.imp_date) = 1 -- 在dathon的基础上进行了代码更正
group by t1.imp_date;
-- 43. 计算2019年6月1日至6月20日,每日DAU中,近3天连续登陆用户数,近3天有领取红包用户数,近3天连续登陆用户的占比,近3天有领取红包用户的占比
/*
近3天连续登陆用户数
近3天有领取红包用户数
选定时间范围: 20190601~ 20190620
难点:连续登录问题通用解法思路
*/
-- dathon:
SELECT a.imp_date
,COUNT(DISTINCT a.qimei) AS dau
,COUNT(DISTINCT b.qimei) AS 3d_user_cnt
,COUNT(DISTINCT c.qimei) AS 3d_packet_cnt
,COUNT(DISTINCT b.qimei) / COUNT(DISTINCT a.qimei) AS 3d_user_rate
,COUNT(DISTINCT c.qimei) / COUNT(DISTINCT a.qimei) AS 3d_packet_rate
FROM (
-- a表求出每天登录的用户
SELECT imp_date
,qimei
FROM tmp_dau_based
WHERE imp_date BETWEEN '20190601' AND '20190620'
) a
LEFT JOIN (
-- b表中,将用户的登录日减去其登录日的排序即可得到date_base字段,相同的date_base出现几次即意味着连续登陆了几天
SELECT qimei
,date_add(date_base, INTERVAL 3 DAY) AS dt
,COUNT(1) AS cnt
FROM(
SELECT qimei
,imp_date
,rank() OVER (PARTITION BY qimei ORDER BY imp_date) AS date_rank
,date_sub(imp_date, INTERVAL (rank() OVER (PARTITION BY qimei ORDER BY imp_date) ) DAY) AS date_base
FROM tmp_dau_based
WHERE imp_date BETWEEN '20190530' AND '20190620'
) t
GROUP BY qimei
,date_add(date_base, INTERVAL 3 DAY)
HAVING COUNT(1) >= 3
) b
ON a.qimei = b.qimei AND date(a.imp_date) = b.dt
LEFT JOIN (
-- c表求取出1-20日领取红包的用户,在表连接的时候限制在近3天即可表示3天内有领取红包的用户
SELECT qimei, imp_date
FROM tmp_packet_based
WHERE imp_date BETWEEN '20190530' AND '20190620'
GROUP BY qimei
,imp_date
) c
ON a.qimei = c.qimei AND (( 0 < datediff(a.imp_date, c.imp_date) ) AND ( datediff(a.imp_date, c.imp_date) <=2))
GROUP BY a.imp_date;
-- 草稿:
SELECT 0 < datediff('20190601', '20190610') <= 2; # 大在前, 小在后;
-- 44. 计算2019年6月1日,领取红包用户领取金额的中位数(用户粒度的中位数,而非红包粒度的中位数)最高的20个用户及排名
SELECT
imp_date
,qimei
,median_money
,rn
FROM
(
SELECT a.imp_date
,a.qimei
,CAST(FORMAT(add_money,4) AS DECIMAL(18, 4)) AS median_money
,rank() over ( PARTITION BY a.imp_date ORDER BY add_money DESC ) rn
FROM(
-- 计算每个用户领取红包的排名
SELECT
imp_date
,qimei
,add_money
,row_number() over ( PARTITION BY imp_date, qimei ORDER BY add_money ) AS rk
FROM tmp_packet_based
WHERE imp_date = '20190601'
) a
LEFT JOIN (
-- 计算每个用户领取红包的个数
SELECT imp_date
,qimei
,count(*) AS num
FROM tmp_packet_based
WHERE imp_date = '20190601'
GROUP BY imp_date
,qimei
) b
ON a.qimei = b.qimei AND a.imp_date = b.imp_date
-- 根据中位数的定义,其rk排名会落在总数+1再除以二的正负0.5之间
WHERE rk >= ( num + 1 ) / 2 - 0.5 AND rk <= ( num + 1 ) / 2 + 0.5
) t
WHERE rn <= 20;
-- 以字符串形式返回的,然后使用 CAST 函数将其转换回数值类型
SELECT CAST(FORMAT(20,4) AS DECIMAL(18, 4));