建表语句
create table t_user_video_action_d(
user_id string,
video_id string,
action_type int,
`timestamp` bigint
)
partitioned by (ds int)
row format delimited
fields terminated by ',';
create table t_video_d(
video_id string,
video_type string,
video_user_id string,
video_create_time bigint,
video_description string
)
partitioned by (ds int)
row format delimited
fields terminated by ',';
create table t_video_user_d(
video_user_id string,
video_user_name string,
video_user_type string
)
partitioned by (ds int)
row format delimited
fields terminated by ',';
题目 1:输出一张搞笑品类表,用于分析作者近 3 条视频情况:输出每个视频创建者最近发布的 3 个视频,在过去一周内的曝光,点赞总数;需要产出字段:视频创建者id,曝光用户数,点赞用户数,曝光次数、点赞次数。
with t1 as (
-- 搞笑,,近一周
select user_id,video_user_id,action_type,row_number() over (partition by video_user_id order by video_create_time desc ) num from t_user_video_action_d uva,t_video_d v where uva.video_id = v.video_id and video_type = '搞笑' and video_create_time >unix_timestamp()+8*3600 -7*24*3600
)
-- 近三条
select video_user_id,
count(`if`(action_type = 1, user_id,0)) `曝光用户数`,
count(`if`(action_type = 1,1,null)) `曝光次数`,
count(`if`(action_type = 2, user_id,0)) `点赞用户数`,
count(`if`(action_type = 2,1,null)) `点赞次数`
from t1 where num <=3 group by video_user_id;
题目 2:输出一个取数结果:在 20221103 日创建的视频中,如果视频描述中带有 "搞笑段子”,“脱口秀”两个关键词即为“搞笑类内容”,不带有这两个关键词为“非搞笑类内容”,最终输出 20221103 日搞笑与非搞笑视频去重数量
select '搞笑视频' `视频类型`,count(distinct video_id) `数量` from t_video_d where ds='20221103' and video_description like "%搞笑段子%" and video_description like "%脱口秀%"
union
select '非搞笑视频',count(distinct video_id) count from t_video_d where ds='20221103' and video_description not like "%搞笑段子%" or video_description not like "%脱口秀%";
题目 3:计算每个用户每天第一次曝光视频的时间戳,运行速度越快越好,输出字段:日期,用户 id,时间戳
-- 曝光率 每个用户每天第一次观看的视频
select user_id,ds,min(`timestamp`) from t_user_video_action_d where action_type = 1 group by user_id,ds;
题目 4:查询过去任意日期的曝光活跃用户的 7 日留存率,输出字段:日期,用户 id,7日留存率
-- 七天留存率 七天前登录的用户,今天是否登录
with t1 as (
select ds,user_id from t_user_video_action_d where action_type = 1 group by ds, user_id
),
t2 as (
select t11.ds ds,count(t11.user_id) today,count(t12.user_id) agoday
from t1 t11 right join t1 t12 on from_unixtime(unix_timestamp(cast(t11.ds as string) ,'yyyyMMdd'),'yyyy-MM-dd') = date_sub(from_unixtime(unix_timestamp(cast(t12.ds as string),'yyyyMMdd'),'yyyy-MM-dd'),7)
and t11.user_id = t12.user_id group by t11.ds
)
select ds,today/agoday `留存率` from t2;