有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)
日期 用户 年龄
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
CREATE TABLE user_age (
dt string,
user_id string, age INT )
ROW format delimited FIELDS TERMINATED BY ',';
1)按照日期以及用户分组,按照日期排序并给出排名
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id;t1
2)计算日期及排名的差值
select
user_id,
age,
date_sub(dt,rk) flag
from
t1;t2
3)过滤出差值大于等于2的,即为连续两天活跃的用户
select
user_id,
min(age) age
from
t2
group by
user_id,flag
having
count(*)>=2;t3
4)对数据进行去重处理(一个用户可以在两个不同的时间点连续登录),例如:a用户在1月10号1月11号以及1月20号和1月21号4天登录。
select
user_id,
min(age) age
from
t3
group by
user_id;t4
5)计算活跃用户(两天连续有访问)的人数以及平均年龄
select
count(*) ct,
cast(sum(age)/count(*) as decimal(10,2))
from t4;
6)对全量数据集进行按照用户去重
select
user_id,
min(age) age
from
user_age
group by
user_id;t5
7)计算所有用户的数量以及平均年龄
select
count(*) user_count,
cast((sum(age)/count(*)) as decimal(10,1))
from
t5;
8)将第5步以及第7步两个数据集进行union all操作
select
0 user_total_count,
0 user_total_avg_age,
count(*) twice_count,
cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age
from
(
select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt,rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id
)t1
)t2
group by
user_id,flag
having
count(*)>=2)t3
group by
user_id
)t4
union all
select
count(*) user_total_count,
cast((sum(age)/count(*)) as decimal(10,1)),
0 twice_count,
0 twice_count_avg_age
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id
)t5;t6
9)计算最终结果
select
sum(user_total_count),
sum(user_total_avg_age),
sum(twice_count),
sum(twice_count_avg_age)
from
(select
0 user_total_count,
0 user_total_avg_age,
count(*) twice_count,
cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age
from
(
select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt,rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id
)t1
)t2
group by
user_id,flag
having
count(*)>=2)t3
group by
user_id
)t4
union all
select
count(*) user_total_count,
cast((sum(age)/count(*)) as decimal(10,1)),
0 twice_count,
0 twice_count_avg_age
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id
)t5)t6;