走进大数据丨 一条让我虎躯一震的SQL

有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)

日期    用户    年龄

2019-02-11,test_1,23

2019-02-11,test_2,19

2019-02-11,test_3,39

2019-02-11,test_1,23

2019-02-11,test_3,39

2019-02-11,test_1,23

2019-02-12,test_2,19

2019-02-13,test_1,23

2019-02-15,test_2,19

2019-02-16,test_2,19

CREATE TABLE user_age (
dt string,
user_id string, age INT ) 
ROW format delimited FIELDS TERMINATED BY ',';

1)按照日期以及用户分组,按照日期排序并给出排名

select
   dt,
   user_id,
   min(age) age,
   rank() over(partition by user_id order by dt) rk
from


   user_age
group by


   dt,user_id;t1

 

2)计算日期及排名的差值

select


   user_id,


   age,


date_sub(dt,rk) flag


from


   t1;t2

3)过滤出差值大于等于2的,即为连续两天活跃的用户

select


   user_id,


min(age) age


from


   t2


group by


   user_id,flag


having


count(*)>=2;t3

4)对数据进行去重处理(一个用户可以在两个不同的时间点连续登录),例如:a用户在1月10号1月11号以及1月20号和1月21号4天登录。

select


   user_id,


min(age) age


from


   t3


group by


   user_id;t4

 

5)计算活跃用户(两天连续有访问)的人数以及平均年龄

select


count(*) ct,


cast(sum(age)/count(*) as decimal(10,2))


from t4;

6)对全量数据集进行按照用户去重

select


   user_id,


min(age) age


from


   user_age


group by


   user_id;t5

7)计算所有用户的数量以及平均年龄

select


count(*) user_count,


cast((sum(age)/count(*)) as decimal(10,1))


from


   t5;

8)将第5步以及第7步两个数据集进行union all操作

select


0 user_total_count,


0 user_total_avg_age,


count(*) twice_count,


cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age


from


(


select


   user_id,


min(age) age


from


   (select


   user_id,


min(age) age


from


   (


select


   user_id,


   age,


date_sub(dt,rk) flag


from


   (


select


       dt,


       user_id,


min(age) age,


rank() over(partition by user_id order by dt) rk


from


       user_age


group by


       dt,user_id


    )t1


    )t2


group by


   user_id,flag


having


count(*)>=2)t3


group by


   user_id


)t4






union all






select


count(*) user_total_count,


cast((sum(age)/count(*)) as decimal(10,1)),


0 twice_count,


0 twice_count_avg_age


from


   (


select


         user_id,


min(age) age


from


         user_age


group by


         user_id


   )t5;t6

9)计算最终结果

select


sum(user_total_count),


sum(user_total_avg_age),


sum(twice_count),


sum(twice_count_avg_age)


from


(select


0 user_total_count,


0 user_total_avg_age,


count(*) twice_count,


cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age


from


(


select


   user_id,


min(age) age


from


   (select


   user_id,


min(age) age


from


   (


select


   user_id,


   age,


date_sub(dt,rk) flag


from


   (


select


       dt,


       user_id,


min(age) age,


rank() over(partition by user_id order by dt) rk


from


       user_age


group by


       dt,user_id


    )t1


    )t2


group by


   user_id,flag


having


count(*)>=2)t3


group by


   user_id


)t4






union all






select


count(*) user_total_count,


cast((sum(age)/count(*)) as decimal(10,1)),


0 twice_count,


0 twice_count_avg_age


from


   (


select


         user_id,


min(age) age


from


         user_age


group by


         user_id


   )t5)t6;

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值