SQL记录

1.确定关注行为的激活周期

-- 取6月注册新用户的用户id和注册时间
select user_id,
       date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30'
order by date; 
-- 看6月注册新用户数据,与事件分析查询一致
select count(distinct user_id)
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30';



在这里插入图片描述
在这里插入图片描述
或许这里发生关注行为不应该限定在6月份,6月注册不一定是什么时候发生关注行为,但是根据数据来看,后面的数据量影响不大。

-- 利用6月注册新用户和6月发生关注行为的用户做关联,找到注册新用户的首次关注行为发生的时间
select t1.user_id,
       min(t2.date)
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id;
-- 利用事件分析验证取数逻辑
select count(distinct t1.user_id)      
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'
and t2.date between '2020-06-01' and '2020-06-30'
;

在这里插入图片描述

在这里插入图片描述
在这里插入图片描述

-- 将两个临时表关联,用两个时间相减,看到不同用户的时间间隔

select t3.user_id,datediff(t4.date2,t3.date)
from (select user_id,
       date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id;


-- 按照时间间隔分组,计算用户数
select date_diff,count(distinct t5.user_id)
from
(
select t3.user_id,datediff(t4.date2,t3.date) as date_diff
from (select user_id,
       date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id
) t5
group by date_diff
order by count(distinct t5.user_id) desc;

出现负数,排查原因,抽取一个用户id查看。


-- 查询原因:为啥出现负数,有多个is_success=1 and is_register=1的行为,注册成功行为不准确
select event,user_id,distinct_id, date,$device_id, $is_login_id, $track_signup_original_id,is_success, is_register
from events
where date between '2020-06-01' and '2020-06-30'
and user_id=-1357271717878251552
and event in ('login_register','following_user')
order by date;
-- 查询用户所有行为序列
select event,user_id,distinct_id, date,$device_id, $is_login_id, $track_signup_original_id,is_success, is_register
from events
where user_id=-1357271717878251552
and event='login_register'
and is_success=1
and is_register=1
order by date;

在这里插入图片描述
把date都取出来,方便查看,排序去掉负数。

-- 将两个临时表关联,用两个时间相减,看到不同用户的时间间隔

select t3.user_id,t3.date,t4.date2,datediff(t4.date2,t3.date) as date_diff
from (select user_id,
             date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id
where datediff(t4.date2,t3.date)>=0
order by date_diff ;

在这里插入图片描述

-- 按照时间间隔分组,计算用户数
select t5.date_diff,count(distinct t5.user_id)
from
(
select t3.user_id,t3.date,t4.date2,datediff(t4.date2,t3.date) as date_diff
from (select user_id,
             date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id
where datediff(t4.date2,t3.date)>=0
) t5
group by date_diff
order by count(distinct t5.user_id) desc;

在这里插入图片描述
在这里插入图片描述

2.确定浏览主页行为的激活周期

替换表2的事件和属性

select t5.date_diff,count(distinct t5.user_id)
from
(
select t3.user_id,t3.date,t4.date2,datediff(t4.date2,t3.date) as date_diff
from (select user_id,
             date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='AppPageView'
and t2.$title in ('个人详情动态','个人详情信息') 
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id
where datediff(t4.date2,t3.date)>=0
) t5
group by date_diff
order by count(distinct t5.user_id) desc;

3.确定发消息的激活周期

select t5.date_diff,count(distinct t5.user_id)
from
(
select t3.user_id,t3.date,t4.date2,datediff(t4.date2,t3.date) as date_diff
from (select user_id,
             date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='message_event' 
and t2.message_event_type='发'
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id
where datediff(t4.date2,t3.date)>=0
) t5
group by date_diff
order by count(distinct t5.user_id) desc;

4.确定收消息的激活周期

属性改为【收】
在这里插入图片描述

5.确定进入直播间和观看直播的激活周期

event=‘enter_live_room’

6.确定成功上麦的激活周期

event=‘connect_mic’

7.确定浏览动态的激活周期

event=‘moment_profile_view’

2.计算留存率

select avg(t8.firstday),
       avg(t8.secondday),
       avg(t8.thirdday),
       avg(t8.fourthday),
       avg(t8.fifthday),
       avg(t8.sixthday),
       avg(t8.seventhday),
       avg(t8.eighthday),
       avg(t8.ninthday),
       avg(t8.tenthday),
       avg(t8.eleventhday),
       avg(t8.twelfthday),
       avg(t8.thirteenthday),
       avg(t8.fourteenthday),
       avg(t8.fifteenthday),
       avg(t8.sixteenthday),
       avg(t8.seventeenthday),
       avg(t8.eighteenthday),
       avg(t8.ninteenthday),
       avg(t8.twentiesthday),
       avg(t8.twentyoneday),
       avg(t8.twentytwoday),
       avg(t8.twentythreeday),
       avg(t8.twentyfourday),
       avg(t8.twentyfiveday),
       avg(t8.twentysixday),
       avg(t8.twentysevenday),
       avg(t8.twentyeightday),
       avg(t8.twentynineday),
       avg(t8.thirtyday)
from
(
select t6.date2,
       count(distinct t6.user_id) total_user, 
       sum(datediff(t7.date,t6.date2)=1)/count(distinct t6.user_id) firstday,
       sum(datediff(t7.date,t6.date2)=2)/count(distinct t6.user_id) secondday,
       sum(datediff(t7.date,t6.date2)=3)/count(distinct t6.user_id) thirdday,
       sum(datediff(t7.date,t6.date2)=4)/count(distinct t6.user_id) fourthday,
       sum(datediff(t7.date,t6.date2)=5)/count(distinct t6.user_id) fifthday,
       sum(datediff(t7.date,t6.date2)=6)/count(distinct t6.user_id) sixthday,
       sum(datediff(t7.date,t6.date2)=7)/count(distinct t6.user_id) seventhday,
       sum(datediff(t7.date,t6.date2)=8)/count(distinct t6.user_id) eighthday,
       sum(datediff(t7.date,t6.date2)=9)/count(distinct t6.user_id) ninthday,
       sum(datediff(t7.date,t6.date2)=10)/count(distinct t6.user_id) tenthday,
       sum(datediff(t7.date,t6.date2)=11)/count(distinct t6.user_id) eleventhday,
       sum(datediff(t7.date,t6.date2)=12)/count(distinct t6.user_id) twelfthday,
       sum(datediff(t7.date,t6.date2)=13)/count(distinct t6.user_id) thirteenthday,
       sum(datediff(t7.date,t6.date2)=14)/count(distinct t6.user_id) fourteenthday,
       sum(datediff(t7.date,t6.date2)=15)/count(distinct t6.user_id) fifteenthday,
       sum(datediff(t7.date,t6.date2)=16)/count(distinct t6.user_id) sixteenthday,
       sum(datediff(t7.date,t6.date2)=17)/count(distinct t6.user_id) seventeenthday,
       sum(datediff(t7.date,t6.date2)=18)/count(distinct t6.user_id) eighteenthday,
       sum(datediff(t7.date,t6.date2)=19)/count(distinct t6.user_id) ninteenthday,
       sum(datediff(t7.date,t6.date2)=20)/count(distinct t6.user_id) twentiesthday,
       sum(datediff(t7.date,t6.date2)=21)/count(distinct t6.user_id) twentyoneday,
       sum(datediff(t7.date,t6.date2)=22)/count(distinct t6.user_id) twentytwoday,
       sum(datediff(t7.date,t6.date2)=23)/count(distinct t6.user_id) twentythreeday,
       sum(datediff(t7.date,t6.date2)=24)/count(distinct t6.user_id) twentyfourday,
       sum(datediff(t7.date,t6.date2)=25)/count(distinct t6.user_id) twentyfiveday,
       sum(datediff(t7.date,t6.date2)=26)/count(distinct t6.user_id) twentysixday,
       sum(datediff(t7.date,t6.date2)=27)/count(distinct t6.user_id) twentysevenday,
       sum(datediff(t7.date,t6.date2)=28)/count(distinct t6.user_id) twentyeightday,
       sum(datediff(t7.date,t6.date2)=29)/count(distinct t6.user_id) twentynineday,
       sum(datediff(t7.date,t6.date2)=30)/count(distinct t6.user_id) thirtyday
from
(
select t3.user_id,t4.date2,datediff(t4.date2,t3.date)
from (select user_id,
       date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'-- 变量
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id
having datediff(t4.date2,t3.date) between 0 and 3
) t6
left join 
(
select distinct t5.user_id,t5.date
from events t5 
where event='$AppStart'
and (date between '2020-06-01' and '2020-07-30')
and user_id in (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') 
order by t5.date) t7
on t6.user_id=t7.user_id
group by t6.date2
order by t6.date2
) t8;

先将两张表准备好:
用户id,和激活周期(暂定激活周期是3天,datediff在0-3)内【首次发生关键行为】的时间:表t6
注册用户的用户id,后续活跃时间:表t7
t6左关联t7,按照时间分组计算用户【首次发生关键行为】后的留存情况,
再取30日留存的平均值
最后第30日留存,实际数据留存率也有点高,用户活跃度比较高,可能有什么活动或者外部因素,可以忽略影响。

加个第0天

select avg(t8.zeroday),
       avg(t8.firstday),
       avg(t8.secondday),
       avg(t8.thirdday),
       avg(t8.fourthday),
       avg(t8.fifthday),
       avg(t8.sixthday),
       avg(t8.seventhday),
       avg(t8.eighthday),
       avg(t8.ninthday),
       avg(t8.tenthday),
       avg(t8.eleventhday),
       avg(t8.twelfthday),
       avg(t8.thirteenthday),
       avg(t8.fourteenthday),
       avg(t8.fifteenthday),
       avg(t8.sixteenthday),
       avg(t8.seventeenthday),
       avg(t8.eighteenthday),
       avg(t8.ninteenthday),
       avg(t8.twentiesthday),
       avg(t8.twentyoneday),
       avg(t8.twentytwoday),
       avg(t8.twentythreeday),
       avg(t8.twentyfourday),
       avg(t8.twentyfiveday),
       avg(t8.twentysixday),
       avg(t8.twentysevenday),
       avg(t8.twentyeightday),
       avg(t8.twentynineday),
       avg(t8.thirtyday)
from
(
select t6.date2,
       count(distinct t6.user_id) total_user, 
       sum(datediff(t7.date,t6.date2)=0)/count(distinct t6.user_id) zeroday,
       sum(datediff(t7.date,t6.date2)=1)/count(distinct t6.user_id) firstday,
       sum(datediff(t7.date,t6.date2)=2)/count(distinct t6.user_id) secondday,
       sum(datediff(t7.date,t6.date2)=3)/count(distinct t6.user_id) thirdday,
       sum(datediff(t7.date,t6.date2)=4)/count(distinct t6.user_id) fourthday,
       sum(datediff(t7.date,t6.date2)=5)/count(distinct t6.user_id) fifthday,
       sum(datediff(t7.date,t6.date2)=6)/count(distinct t6.user_id) sixthday,
       sum(datediff(t7.date,t6.date2)=7)/count(distinct t6.user_id) seventhday,
       sum(datediff(t7.date,t6.date2)=8)/count(distinct t6.user_id) eighthday,
       sum(datediff(t7.date,t6.date2)=9)/count(distinct t6.user_id) ninthday,
       sum(datediff(t7.date,t6.date2)=10)/count(distinct t6.user_id) tenthday,
       sum(datediff(t7.date,t6.date2)=11)/count(distinct t6.user_id) eleventhday,
       sum(datediff(t7.date,t6.date2)=12)/count(distinct t6.user_id) twelfthday,
       sum(datediff(t7.date,t6.date2)=13)/count(distinct t6.user_id) thirteenthday,
       sum(datediff(t7.date,t6.date2)=14)/count(distinct t6.user_id) fourteenthday,
       sum(datediff(t7.date,t6.date2)=15)/count(distinct t6.user_id) fifteenthday,
       sum(datediff(t7.date,t6.date2)=16)/count(distinct t6.user_id) sixteenthday,
       sum(datediff(t7.date,t6.date2)=17)/count(distinct t6.user_id) seventeenthday,
       sum(datediff(t7.date,t6.date2)=18)/count(distinct t6.user_id) eighteenthday,
       sum(datediff(t7.date,t6.date2)=19)/count(distinct t6.user_id) ninteenthday,
       sum(datediff(t7.date,t6.date2)=20)/count(distinct t6.user_id) twentiesthday,
       sum(datediff(t7.date,t6.date2)=21)/count(distinct t6.user_id) twentyoneday,
       sum(datediff(t7.date,t6.date2)=22)/count(distinct t6.user_id) twentytwoday,
       sum(datediff(t7.date,t6.date2)=23)/count(distinct t6.user_id) twentythreeday,
       sum(datediff(t7.date,t6.date2)=24)/count(distinct t6.user_id) twentyfourday,
       sum(datediff(t7.date,t6.date2)=25)/count(distinct t6.user_id) twentyfiveday,
       sum(datediff(t7.date,t6.date2)=26)/count(distinct t6.user_id) twentysixday,
       sum(datediff(t7.date,t6.date2)=27)/count(distinct t6.user_id) twentysevenday,
       sum(datediff(t7.date,t6.date2)=28)/count(distinct t6.user_id) twentyeightday,
       sum(datediff(t7.date,t6.date2)=29)/count(distinct t6.user_id) twentynineday,
       sum(datediff(t7.date,t6.date2)=30)/count(distinct t6.user_id) thirtyday
from
(
select t3.user_id,t4.date2,datediff(t4.date2,t3.date)
from (select user_id,
       date
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t3
join 
(select t1.user_id,
       min(t2.date) as date2
from (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') t1
join events t2 on t1.user_id=t2.user_id
where t2.event='following_user'-- 变量
and t2.date between '2020-06-01' and '2020-06-30'
group by t1.user_id) t4 on t3.user_id=t4.user_id
having datediff(t4.date2,t3.date) between 0 and 3
) t6 -- 第一张表:用户id和关注时间
left join 
(
select distinct t5.user_id,t5.date
from events t5 
where event='$AppStart'
and (date between '2020-06-01' and '2020-07-30')
and user_id in (select user_id
from events
where event='login_register'
and is_register=1
and is_success=1
and date between '2020-06-01' and '2020-06-30') 
order by t5.date) t7 -- 第二张表:活跃用户id和活跃时间
on t6.user_id=t7.user_id
group by t6.date2
order by t6.date2
) t8;

取数逻辑

1.6月份注册成功用户数
2.激活周期
利用6月份注册成功的用户,计算注册-行为A的时间差计算时长(以天为计算单位)

计算完时长,按照1、2、3天时间间隔来分组计算人数,同时计算累计百分比,找到占比80%的点,定为激活周期的大概时间段
(每一个行为重复计算,利用这个SQL,或者写SQL变量)
多个行为的激活周期都要参考,sql换一下行为就可以了,或者变量换参数,激活周期用来后面寻找魔法数字,要统计激活周期内发生的行为次数。

3.绘制留存曲线

用户先大概看一下:发生注册,且在激活周期内(比如7天窗口期)发生关键行为A的用户有多少,发生注册,且在激活周期内发生关键行为B的用户有多少,都是一样的

针对每一个关键行为:比如在激活周期内,发生关注行为的5w人,在后面每一天发生对应的留存行为:活跃行为,需要计算第一天的留存率,第二天的留存率,第三天的留存率,需要计算30天,横轴为时间(这里取平均数还是加权平均值?)

Sql计算留存率,或者说我们自己的模型计算留存率的公式和逻辑:初始行为是关注,后续行为是活跃,且是激活周期内【首次发生过关注】的这些用户,按照每一天来看30日留存,至此,找到关键的激活行为,之后再去看次数

4.比如激活行为是是关注

需要统计【激活周期】内,这些【发生关注】的用户,具体发生的次数/关注的用户数,按照该维度分组,多少用户
5.发生一次,发生两次,发生三次的人数也得保存下来,需要计算他们对应的的次日/次周留存(次日、次周需要根据用户激活周期来看)因为要画那个曲线,是需要定义魔法数字的。
6.激活周期内发生2次的这批用户,对应的次周留存,但是激活周期取的是3天,用这三天的人数做分母,然后再取对应的下一周的活跃用户,作为分子,激活周期发生两次的这些用户起始时间也是不同的,按照周来分组就行了,然后再计算平均值。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值