最近在深入了解Hive,尚硅谷的这5道题很经典,有引导意义,分步解题也很有用,故记录之,方便回看
1.连续问题
如下数据为蚂蚁森林中用户领取的减少碳排放量
找出连续 3 天及以上减少碳排放量在 100 以上的用户
id dt lowcarbon
1001 2021-12-12 123
1002 2021-12-12 45
1001 2021-12-13 43
1001 2021-12-13 45
1001 2021-12-13 23
1002 2021-12-14 45
1001 2021-12-14 230
1002 2021-12-15 45
1001 2021-12-15 23
建表
-- Ant-forest carbon-reduction records: one row per user per collection event.
-- Dropped first so the script can be re-run without "table already exists"
-- errors, matching the discount/living tables later in this file.
drop table if exists mayi;
create table mayi
(
    id        int,    -- user id
    dt        string, -- record date, 'yyyy-MM-dd'
    lowcarbon int     -- carbon reduction amount of this record
)
row format delimited
fields terminated by '\t';
load data local inpath "/home/hadoop/test/mayi.txt" into table mayi;
-- sanity check: inspect the loaded rows
select *
from mayi;
--1)求出每日lowcarbon在100以上的用户记录
-- Step 1: total per user per day, keeping only days whose total
-- carbon reduction exceeds 100.
select id,
       dt,
       sum(lowcarbon) as lowcarbon
from mayi
group by id, dt
having sum(lowcarbon) > 100;
-- t1
-- 2)等差数列法
-- 两个等差数据如果等差相同,则相同位置的数据相减得到的相同
-- 按照用户分组,同时按照时间排序,计算每条数据的Rank
-- Step 2 (arithmetic-progression trick): rank each qualifying day per
-- user in date order; consecutive dates then share a constant
-- (date - rank) offset.
select id, dt, lowcarbon,
       rank() over (partition by id order by dt) as rk
from t1; -- t1 = result of step 1
-- t2
-- 3)将每行数据中的日期减去Rank值
-- Step 3: subtract the rank from the date; every row of one
-- consecutive run collapses to the same flag value.
select id, dt, lowcarbon,
       date_sub(dt, rk) as flag
from t2; -- t2 = result of step 2
-- t3
-- 4) 按照用户及 Flag 分组,求每个组有多少条数据,并找出大于等于3条的数据
-- Final query for problem 1, rewritten with CTEs: users with 3+
-- consecutive days of over-100 carbon reduction.
with daily as ( -- per-user daily totals above 100
    select id, dt, sum(lowcarbon) as lowcarbon
    from mayi
    group by id, dt
    having sum(lowcarbon) > 100
),
ranked as ( -- rank qualifying days per user by date
    select id, dt, lowcarbon,
           rank() over (partition by id order by dt) as rk
    from daily
),
flagged as ( -- equal flag <=> dates are consecutive
    select id, dt, lowcarbon, date_sub(dt, rk) as flag
    from ranked
)
select id, flag, count(*) as ct
from flagged
group by id, flag
having count(*) >= 3;
2.分组问题
如下为电商公司用户访问时间数据
id ts(秒)
1001 17523641234
1001 17523641256
1002 17523641278
1001 17523641334
1002 17523641434
1001 17523641534
1001 17523641544
1002 17523641634
1001 17523641638
1001 17523641654
某个用户连续的访问记录如果时间间隔小于 60 秒,则分为同一个组,结果为:
id ts(秒) group
1001 17523641234 1
1001 17523641256 1
1001 17523641334 2
1001 17523641534 3
1001 17523641544 3
1001 17523641638 4
1001 17523641654 4
1002 17523641278 1
1002 17523641434 2
1002 17523641634 3
-- User visit events for the e-commerce session-grouping problem.
-- Dropped first for re-runnability, consistent with discount/living below.
drop table if exists elec;
create table elec
(
    id int,    -- user id
    ts bigint  -- visit timestamp, in seconds
)
row format delimited
fields terminated by '\t';
load data local inpath "/home/hadoop/test/elec.txt" into table elec;
-- sanity check: inspect the loaded rows
select *
from elec;
-- 1)将上一行时间数据下移
-- lead:取后面第N行的值(领先)
-- lag:取前面第N行的值(滞后)
-- Step 1: pull the previous visit time onto the current row.
-- lag(ts, 1, 0): previous ts within the user's partition, 0 for the first row.
select id, ts,
       lag(ts, 1, 0) over (partition by id order by ts) as lagts
from elec;
-- t1
-- 2)将当前行时间数据减去上一行时间数据
-- Step 2: gap in seconds between this visit and the previous one.
select id, ts,
       ts - lagts as tsdiff
from t1; -- t1 = result of step 1
-- t2
-- Steps 1+2 combined: per-visit gap to the previous visit.
with prev as (
    select id, ts,
           lag(ts, 1, 0) over (partition by id order by ts) as lagts
    from elec
)
select id, ts,
       ts - lagts as tsdiff
from prev;
-- 3)计算每个用户范围内从第一行到当前行tsdiff大于等于60的总个数(分组号)
-- Final query for problem 2: a gap >= 60s starts a new session, so the
-- running count of such gaps from the first row is the group id.
with prev as ( -- previous visit time per user
    select id, ts,
           lag(ts, 1, 0) over (partition by id order by ts) as lagts
    from elec
),
gaps as ( -- gap to the previous visit
    select id, ts, ts - lagts as tsdiff
    from prev
)
select id, ts,
       sum(if(tsdiff >= 60, 1, 0)) over (partition by id order by ts) as groupid
from gaps;
3.间隔连续问题
某游戏公司记录的用户每日登录数据
注意:有的时候hql过于复杂,需要spark任务也比较多,可能需要建立临时表分多步运行
id dt
1001 2021-12-12
1002 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1002 2021-12-16
1001 2021-12-19
1002 2021-12-17
1001 2021-12-20
计算每个用户最大的连续登录天数,可以间隔一天。
解释:如果一个用户在 1,3,5,6 登录游戏,则视为连续 6 天登录。
-- Daily login records for the gapped-streak problem.
-- Dropped first for re-runnability, consistent with discount/living below.
drop table if exists user_login;
create table user_login
(
    id int,    -- player id
    dt string  -- login date, 'yyyy-MM-dd'
)
row format delimited
fields terminated by '\t';
load data local inpath "/home/hadoop/test/user_login.txt" into table user_login;
-- sanity check: inspect the loaded rows
select *
from user_login;
-- 思路二:分组
-- 1001 2021-12-12
-- 1001 2021-12-13
-- 1001 2021-12-14
-- 1001 2021-12-16
-- 1001 2021-12-19
-- 1001 2021-12-20
-- 1)将上一行时间数据下移
-- 1001 2021-12-12 1970-01-01
-- 1001 2021-12-13 2021-12-12
-- 1001 2021-12-14 2021-12-13
-- 1001 2021-12-16 2021-12-14
-- 1001 2021-12-19 2021-12-16
-- 1001 2021-12-20 2021-12-19
-- Step 1: previous login date per user ('1970-01-01' for the first row).
select id, dt,
       lag(dt, 1, '1970-01-01') over (partition by id order by dt) as lagdt
from user_login;
-- t1
-- 2)将当前行时间减去上一行时间数据(datediff(dt1,dt2))
-- 1001 2021-12-12 564564
-- 1001 2021-12-13 1
-- 1001 2021-12-14 1
-- 1001 2021-12-16 2
-- 1001 2021-12-19 3
-- 1001 2021-12-20 1
-- Step 2: day gap to the previous login.
select id, dt,
       datediff(dt, lagdt) as flag
from t1; -- t1 = result of step 1
-- Materialize step 2 as a staging table so later steps can reuse it.
-- Drop first: "create table ... as" fails if the table already exists,
-- so the original statement was not re-runnable.
drop table if exists user_login_tmp;
create table user_login_tmp as
select id,
       dt,
       datediff(dt, lagdt) flag
from (select id,
             dt,
             lag(dt, 1, '1970-01-01') over (partition by id order by dt) lagdt
      from user_login) t1;
-- sanity check: inspect the staged rows
select * from user_login_tmp;
-- t2
-- 3)按照用户分组,同时按照时间排序,计算从第一行到当前行大于2的数据的总条数(sum(if(flag>2,1,0)))
-- 1001 2021-12-12 1
-- 1001 2021-12-13 1
-- 1001 2021-12-14 1
-- 1001 2021-12-16 1
-- 1001 2021-12-19 2
-- 1001 2021-12-20 2
-- Step 3: a gap > 2 days breaks the streak (one skipped day is allowed);
-- the running count of breaks labels each streak.
select id, dt,
       sum(if(flag > 2, 1, 0)) over (partition by id order by dt) as flag
from t2; -- t2 = result of step 2
-- Steps 1-3 combined via CTEs: label each login with its streak id.
with prev as (
    select id, dt,
           lag(dt, 1, '1970-01-01') over (partition by id order by dt) as lagdt
    from user_login
),
gaps as (
    select id, dt, datediff(dt, lagdt) as flag
    from prev
)
select id, dt,
       sum(if(flag > 2, 1, 0)) over (partition by id order by dt) as flag2
from gaps;
-- Same streak labelling, but reading the materialized staging table.
select id, dt,
       sum(if(flag > 2, 1, 0)) over (partition by id order by dt) as flag2
from user_login_tmp;
-- t3
-- 4)按照用户和flag分组,求最大时间减去最小时间并加上1
-- Step 4: span of each streak in days (max date - min date).
select id, flag,
       datediff(max(dt), min(dt)) as days
from t3 -- t3 = result of step 3
group by id, flag;
-- t4
-- 5)取连续登录天数的最大值
-- Step 5: longest streak per user; +1 counts both endpoint days.
select id,
       max(days) + 1
from t4 -- t4 = result of step 4
group by id;
-- 6)最终HQL
-- set yarn.scheduler.maximum-allocation-mb=4;
-- set spark.executor.cores=4;
-- set spark.executor.instances=6;
-- set spark.executor.memory="3.2g";
-- set spark.yarn.executor.memoryOverhead="0.8G";
-- set spark.executor.cores=1;
-- set spark.executor.instances=24;
-- set spark.default.parallelism=20;
-- set spark.driver.memory="819MB";
-- set spark.driver.memoryOverhead="205MB";
set spark.executor.instances = 20;
set spark.executor.cores = 1;
set spark.executor.memory = 1g;
-- Final query for problem 3: longest login streak per user, where a
-- single skipped day keeps the streak alive (datediff <= 2).
with prev as ( -- previous login date per user
    select id, dt,
           lag(dt, 1, '1970-01-01') over (partition by id order by dt) as lagdt
    from user_login
),
gaps as ( -- day gap to the previous login
    select id, dt, datediff(dt, lagdt) as flag
    from prev
),
streaks as ( -- running count of gaps > 2 labels each streak
    select id, dt,
           sum(if(flag > 2, 1, 0)) over (partition by id order by dt) as flag
    from gaps
),
spans as ( -- day span of each streak
    select id, flag, datediff(max(dt), min(dt)) as days
    from streaks
    group by id, flag
)
select id,
       max(days) + 1
from spans
group by id;
-- join配置
-- set hive.auto.convert.join=false;
--
-- set mapred.map.tasks.speculative.execution=true;
-- set mapred.reduce.tasks.speculative.execution=true;
4.打折日期交叉问题
如下为平台商品促销数据:字段为品牌,打折开始日期,打折结束日期
brand stt edt
oppo 2021-06-05 2021-06-09
oppo 2021-06-11 2021-06-21
vivo 2021-06-05 2021-06-15
vivo 2021-06-09 2021-06-21
redmi 2021-06-05 2021-06-21
redmi 2021-06-09 2021-06-15
redmi 2021-06-17 2021-06-26
huawei 2021-06-05 2021-06-26
huawei 2021-06-09 2021-06-15
huawei 2021-06-17 2021-06-21
-- 计算每个品牌总的打折销售天数,注意其中的交叉日期,比如 vivo 品牌,第一次活动时间为 2021-06-05 到 2021-06-15,
-- 第二次活动时间为 2021-06-09 到 2021-06-21 其中 9 号到 15号为重复天数,只统计一次,即 vivo 总打折天数为 2021-06-05 到 2021-06-21 共计 17 天。
-- Promotion periods per brand; re-runnable thanks to the drop.
drop table if exists discount;
create table discount
(
    id  string, -- brand name
    stt string, -- promotion start date, 'yyyy-MM-dd'
    edt string  -- promotion end date, 'yyyy-MM-dd'
)
row format delimited
fields terminated by '\t';
load data local inpath "/home/hadoop/test/discount.txt" into table discount;
-- sanity check: inspect the loaded rows
select *
from discount;
-- id stt edt
-- oppo 2021-06-05 2021-06-09
-- oppo 2021-06-11 2021-06-21
--
-- vivo 2021-06-05 2021-06-15
-- vivo 2021-06-09 2021-06-21
--
-- redmi 2021-06-05 2021-06-21
-- redmi 2021-06-09 2021-06-15
-- redmi 2021-06-17 2021-06-26
--
-- huawei 2021-06-05 2021-06-26
-- huawei 2021-06-09 2021-06-15
-- huawei 2021-06-17 2021-06-21
-- select * from business;
-- select name,orderdate,cost,
-- sum(cost) over() as sample1
-- ,--所有行相加
-- sum(cost) over(partition by name order by orderdate rows between UNBOUNDED PRECEDING and current row ) as sample4
-- ,--和 sample3 一样,由起点到当前行的聚合
-- sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING AND 1 FOLLOWING ) as sample6
-- -- ,--当前行和前边一行及后面一行
-- from business
-- 1)将当前行以前的数据中最大的edt放置当前行
-- Step 1: for each promotion, the latest end date among all earlier
-- promotions of the same brand (null for the brand's first promotion).
select id, stt, edt,
       max(edt) over (partition by id order by stt
                      rows between unbounded preceding and 1 preceding) as maxEdt
from discount;
-- t1
-- redmi 2021-06-05 2021-06-21 null
-- redmi 2021-06-09 2021-06-15 2021-06-21
-- redmi 2021-06-17 2021-06-26 2021-06-21
-- 2)比较开始时间与移动下来的数据,如果开始时间大,则不需要操作,
-- 反之则需要将移动下来的数据 加一 替换当前行的开始时间
-- 如果是第一行数据,maxEDT为null,则不需要操作
-- Step 2: clip the start date so it never falls inside an earlier
-- promotion: keep stt when it is later than every previous edt (or when
-- there is no previous edt), otherwise move it to the day after maxEdt.
select id,
       case
           when maxEdt is null or stt > maxEdt then stt
           else date_add(maxEdt, 1)
       end as stt,
       edt
from t1; -- t1 = result of step 1
-- t2
-- redmi 2021-06-05 2021-06-21
-- redmi 2021-06-22 2021-06-15
-- redmi 2021-06-22 2021-06-26
-- 3)将每行数据中的结束日期减去开始日期
-- Step 3: length of each clipped interval; negative means the
-- promotion was fully covered by an earlier one.
select id,
       datediff(edt, stt) as days
from t2; -- t2 = result of step 2
-- t3
-- redmi 16
-- redmi -7
-- redmi 4
-- 4)按照品牌分组,计算每条数据加一的总和
-- Step 4: sum interval lengths per brand; +1 per interval counts the
-- inclusive end day, fully-covered intervals contribute 0.
select id,
       sum(if(days >= 0, days + 1, 0)) as days
from t3 -- t3 = result of step 3
group by id;
-- redmi 22
-- 5)最终HQL
-- Final query for problem 4: total discount days per brand, counting
-- overlapping periods only once.
with shifted as ( -- latest earlier end date per brand
    select id, stt, edt,
           max(edt) over (partition by id order by stt
                          rows between unbounded preceding and 1 preceding) as maxEdt
    from discount
),
clipped as ( -- push the start past any earlier promotion
    select id,
           if(maxEdt is null, stt,
              if(stt > maxEdt, stt, date_add(maxEdt, 1))) as stt,
           edt
    from shifted
),
lengths as ( -- interval length; negative = fully covered
    select id, datediff(edt, stt) as days
    from clipped
)
select id,
       sum(if(days >= 0, days + 1, 0)) as days
from lengths
group by id;
5.同时在线问题
如下为某直播平台主播开播及关播时间,根据该数据计算出平台最高峰同时在线的主播人数。
id stt edt
1003 2021-06-14 12:12:12 2021-06-14 18:12:12
1004 2021-06-14 13:12:12 2021-06-14 16:12:12
1002 2021-06-14 15:12:12 2021-06-14 16:12:12
1005 2021-06-14 15:18:12 2021-06-14 20:12:12
1001 2021-06-14 20:12:12 2021-06-14 23:12:12
1006 2021-06-14 21:12:12 2021-06-14 23:15:12
1007 2021-06-14 22:12:12 2021-06-14 23:10:12
-- Streamer session records (start/end time); re-runnable thanks to the drop.
drop table if exists living;
create table living
(
    id  string, -- streamer id
    stt string, -- stream start time, 'yyyy-MM-dd HH:mm:ss'
    edt string  -- stream end time, 'yyyy-MM-dd HH:mm:ss'
)
row format delimited
fields terminated by '\t';
load data local inpath "/home/hadoop/test/living.txt" into table living;
-- sanity check: inspect the loaded rows
select *
from living;
-- 流式!
-- 1)对数据分类,在开始数据后添加正1,表示有主播上线,同时在关播数据后添加-1,表示有主播下线
-- Step 1: explode each session into two events: +1 at start, -1 at end.
-- union all, not union: union deduplicates, which would silently drop an
-- event when the same streamer starts (or ends) twice at the same instant,
-- and it also costs an unnecessary distinct/shuffle.
select id, stt dt, 1 p from living
union all
select id, edt dt, -1 p from living;
-- t1
-- 1001 2021-06-14 12:12:12 1
-- 1001 2021-06-14 18:12:12 -1
-- 1001 2021-06-14 20:12:12 1
-- 1001 2021-06-14 23:12:12 -1
-- 1002 2021-06-14 15:12:12 1
-- 1002 2021-06-14 16:12:12 -1
-- 1003 2021-06-14 13:12:12 1
-- 1003 2021-06-14 16:12:12 -1
-- 1004 2021-06-14 13:15:12 1
-- 1004 2021-06-14 20:12:12 -1
-- 1005 2021-06-14 15:18:12 1
-- 1005 2021-06-14 20:12:12 -1
-- 1006 2021-06-14 21:12:12 1
-- 1006 2021-06-14 23:15:12 -1
-- 1007 2021-06-14 22:12:12 1
-- 1007 2021-06-14 23:10:12 -1
-- 2)按照时间排序,计算累加人数
-- Step 2: running total of +1/-1 events in time order = number of
-- streamers online at each event time.
-- union all instead of union: no dedup wanted (coincident events must
-- all be counted) and it avoids an extra shuffle.
select id,
       dt,
       sum(p) over (order by dt) sum_p
from (select id, stt dt, 1 p from living
      union all
      select id, edt dt, -1 p from living) t1;
-- t2
-- 3)找出同时在线人数最大值
-- Final query for problem 5: peak number of streamers online at once.
-- Fixes: (1) removed the leftover debug filter
-- "where dt = '2021-06-14 20:12:12'", which restricted max() to a single
-- instant instead of scanning every event time; (2) union -> union all so
-- coincident start/end events are never deduplicated away.
select max(sum_p) max_online
from (select id,
             dt,
             sum(p) over (order by dt) sum_p
      from (select id, stt dt, 1 p from living
            union all
            select id, edt dt, -1 p from living) t1) t2;