Hive HQL: 5 Classic Interview Questions

I have been digging deeper into Hive recently. These 5 problems from 尚硅谷 are classics: they are instructive, and the step-by-step solutions are very useful, so I am recording them here for future reference.

1. Consecutive-days problem

The data below records the carbon-emission reduction collected by users in Ant Forest (蚂蚁森林).
Find the users whose reduction exceeded 100 on 3 or more consecutive days.

id dt lowcarbon
1001 2021-12-12 123
1002 2021-12-12 45
1001 2021-12-13 43
1001 2021-12-13 45
1001 2021-12-13 23
1002 2021-12-14 45
1001 2021-12-14 230
1002 2021-12-15 45
1001 2021-12-15 23

Create the table and load the sample data:

create table mayi
(
    id        int,
    dt        string,
    lowcarbon int
)
    row format delimited
        fields terminated by '\t';
load data local inpath "/home/hadoop/test/mayi.txt" into table mayi;
select *
from mayi;
-- 1) Total each user's daily lowcarbon and keep only the days where the total exceeds 100
select id, dt, sum(lowcarbon) lowcarbon
from mayi
group by id, dt
having lowcarbon > 100;
-- t1
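-- For reference (worked out by hand from the sample data above), t1 contains only
-- user 1001's qualifying days; 2021-12-13 sums to 43 + 45 + 23 = 111:
-- 1001	2021-12-12	123
-- 1001	2021-12-13	111
-- 1001	2021-12-14	230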
-- 2) Arithmetic-sequence (date minus rank) trick
-- If two arithmetic sequences share the same common difference, subtracting them element-wise yields a constant.
-- Partition by user, order by date, and compute each row's rank.
select id,
       dt,
       lowcarbon,
       rank() over (partition by id order by dt) rk
from t1;
-- t2

-- 3) Subtract the rank from the date on each row; consecutive days collapse to the same flag value
select id,
       dt,
       lowcarbon,
       date_sub(dt, rk) flag
from t2;
-- t3

-- 4) Group by user and flag, count the rows in each group, and keep the groups with at least 3 rows
select id,
       flag,
       count(*) ct
from (select id,
             dt,
             lowcarbon,
             date_sub(dt, rk) flag
      from (select id,
                   dt,
                   lowcarbon,
                   rank() over (partition by id order by dt) rk
            from (select id, dt, sum(lowcarbon) lowcarbon
                  from mayi
                  group by id, dt
                  having lowcarbon > 100) t1) t2) t3
group by id, flag
having ct >= 3;
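As an optional extension (a sketch of my own, not part of the original solution), the same grouping can also report each qualifying streak's date range; on the sample data it should return 1001 with the streak 2021-12-12 to 2021-12-14 (3 days):

select id,
       min(dt)  start_dt,
       max(dt)  end_dt,
       count(*) ct
from (select id,
             dt,
             date_sub(dt, rk) flag
      from (select id,
                   dt,
                   rank() over (partition by id order by dt) rk
            from (select id, dt, sum(lowcarbon) lowcarbon
                  from mayi
                  group by id, dt
                  having lowcarbon > 100) t1) t2) t3
group by id, flag
having count(*) >= 3;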

2. Grouping problem

The data below records user visit times at an e-commerce company.

id ts (seconds)
1001 17523641234
1001 17523641256
1002 17523641278
1001 17523641334
1002 17523641434
1001 17523641534
1001 17523641544
1002 17523641634
1001 17523641638
1001 17523641654

For each user, consecutive visits less than 60 seconds apart belong to the same group; the expected result is:

id ts (seconds) group
1001 17523641234 1
1001 17523641256 1
1001 17523641334 2
1001 17523641534 3
1001 17523641544 3
1001 17523641638 4
1001 17523641654 4
1002 17523641278 1
1002 17523641434 2
1002 17523641634 3

create table elec
(
    id int,
    ts bigint
)
    row format delimited
        fields terminated by '\t';

load data local inpath "/home/hadoop/test/elec.txt" into table elec;

select *
from elec;
-- 1) Shift the previous row's timestamp down to the current row
-- lead: takes a value from a following row
-- lag:  takes a value from a preceding row
select id,
       ts,
       lag(ts, 1, 0) over (partition by id order by ts) lagts
from elec;
-- t1

-- 2) Subtract the previous row's timestamp from the current row's
select id,
       ts,
       ts - lagts tsdiff
from t1;
-- t2
select id,
       ts,
       ts - lagts tsdiff
from (select id,
             ts,
             lag(ts, 1, 0) over (partition by id order by ts) lagts
      from elec) t1;

-- 3) Within each user, count how many rows from the first row up to the current row have tsdiff >= 60; this running count is the group id
select id,
       ts,
       sum(if(tsdiff >= 60, 1, 0)) over (partition by id order by ts) groupid
from (select id,
             ts,
             ts - lagts tsdiff
      from (select id,
                   ts,
                   lag(ts, 1, 0) over (partition by id order by ts) lagts
            from elec) t1) t2;
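A small follow-up sketch (my own addition): because the group id is a running count per user, each user's number of sessions is simply its maximum. Worked out by hand, 1001's tsdiff values after the large first value are 22, 78, 200, 10, 94, 16, which yields groups 1, 1, 2, 3, 3, 4, 4 exactly as in the expected output, so 1001 has 4 sessions and 1002 has 3:

select id,
       max(groupid) session_cnt
from (select id,
             ts,
             sum(if(tsdiff >= 60, 1, 0)) over (partition by id order by ts) groupid
      from (select id,
                   ts,
                   ts - lagts tsdiff
            from (select id,
                         ts,
                         lag(ts, 1, 0) over (partition by id order by ts) lagts
                  from elec) t1) t2) t3
group by id;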

3. Consecutive days with allowed gaps

Daily login data recorded by a game company.
Note: when a single HQL statement grows too complex it also generates a lot of Spark work; it can be better to materialize intermediate results as tables and run the job in several steps (see user_login_tmp and the temporary-table sketch below).

id dt
1001 2021-12-12
1002 2021-12-12
1001 2021-12-13
1001 2021-12-14
1001 2021-12-16
1002 2021-12-16
1001 2021-12-19
1002 2021-12-17
1001 2021-12-20

Compute each user's longest consecutive login streak, where a one-day gap is allowed.
Explanation: if a user logs in on days 1, 3, 5 and 6, that counts as a 6-day streak (day 1 through day 6).

create table user_login
(
    id int,
    dt string
)
    row format delimited
        fields terminated by '\t';

load data local inpath "/home/hadoop/test/user_login.txt" into table user_login;

select *
from user_login;
-- Approach 2: grouping
-- 1001	2021-12-12
-- 1001	2021-12-13
-- 1001	2021-12-14
-- 1001	2021-12-16
-- 1001	2021-12-19
-- 1001	2021-12-20

-- 1) Shift the previous row's date down to the current row
-- 1001	2021-12-12	1970-01-01
-- 1001	2021-12-13	2021-12-12
-- 1001	2021-12-14	2021-12-13
-- 1001	2021-12-16	2021-12-14
-- 1001	2021-12-19	2021-12-16
-- 1001	2021-12-20	2021-12-19
select id,
       dt,
       lag(dt, 1, '1970-01-01') over (partition by id order by dt) lagdt
from user_login;
-- t1

-- 2) Subtract the previous row's date from the current row's (datediff(dt1, dt2))
-- 1001	2021-12-12	18973
-- 1001	2021-12-13	1
-- 1001	2021-12-14	1
-- 1001	2021-12-16	2
-- 1001	2021-12-19	3
-- 1001	2021-12-20	1
select id,
       dt,
       datediff(dt, lagdt) flag
from t1;
create table user_login_tmp as
select id,
       dt,
       datediff(dt, lagdt) flag
from (select id,
       dt,
       lag(dt, 1, '1970-01-01') over (partition by id order by dt) lagdt
from user_login)t1;
select * from user_login_tmp;
-- t2
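-- Since user_login_tmp is only an intermediate step, Hive's CREATE TEMPORARY TABLE
-- (available since 0.14) can be used instead so the table is dropped automatically at
-- the end of the session; a sketch (the name user_login_tmp2 is just an example):
create temporary table user_login_tmp2 as
select id,
       dt,
       datediff(dt, lagdt) flag
from (select id,
             dt,
             lag(dt, 1, '1970-01-01') over (partition by id order by dt) lagdt
      from user_login) t1;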

-- 3) Partition by user, order by date, and compute the running count of rows whose flag is greater than 2 (sum(if(flag > 2, 1, 0)))
-- 1001	2021-12-12	1
-- 1001	2021-12-13	1
-- 1001	2021-12-14	1
-- 1001	2021-12-16	1
-- 1001	2021-12-19	2
-- 1001	2021-12-20	2
select id,
       dt,
       sum(if(flag > 2, 1, 0)) over (partition by id order by dt) flag
from t2;

select id,
       dt,
       sum(if(flag > 2, 1, 0)) over (partition by id order by dt) flag2
from (select id,
       dt,
       datediff(dt, lagdt) flag
from (select id,
       dt,
       lag(dt, 1, '1970-01-01') over (partition by id order by dt) lagdt
from user_login)t1)t2;

select id,
       dt,
       sum(if(flag > 2, 1, 0)) over (partition by id order by dt) flag2
from user_login_tmp;
-- t3

-- 4) Group by user and flag and take max(dt) minus min(dt); the +1 that turns the difference into a day count is applied in the next step
select id,
       flag,
       datediff(max(dt), min(dt)) days
from t3
group by id, flag;
-- t4

-- 5) Take each user's maximum, plus 1, as the longest streak in days
select id,
       max(days) + 1
from t4
group by id;

-- 6) Final HQL
-- set yarn.scheduler.maximum-allocation-mb=4;
-- set spark.executor.cores=4;
-- set  spark.executor.instances=6;
-- set  spark.executor.memory="3.2g";
-- set  spark.yarn.executor.memoryOverhead="0.8G";
-- set spark.executor.cores=1;
-- set  spark.executor.instances=24;
-- set  spark.default.parallelism=20;
-- set  spark.driver.memory="819MB";
-- set  spark.driver.memoryOverhead="205MB";

set spark.executor.instances=20;
set spark.executor.cores=1;
set spark.executor.memory=1g;
select id,
       max(days) + 1
from (select id,
             flag,
             datediff(max(dt), min(dt)) days
      from (select id,
                   dt,
                   sum(if(flag > 2, 1, 0)) over (partition by id order by dt) flag
            from (select id,
                         dt,
                         datediff(dt, lagdt) flag
                  from (select id,
                               dt,
                               lag(dt, 1, '1970-01-01') over (partition by id order by dt) lagdt
                        from user_login) t1) t2) t3
      group by id, flag) t4
group by id;
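For readability, the same pipeline can also be written with CTEs (WITH clauses, supported since Hive 0.13) instead of nested subqueries; this is an equivalent sketch. Worked out by hand on the sample data, the result should be 1001 -> 5 and 1002 -> 2:

with t1 as (select id,
                   dt,
                   lag(dt, 1, '1970-01-01') over (partition by id order by dt) lagdt
            from user_login),
     t2 as (select id, dt, datediff(dt, lagdt) flag
            from t1),
     t3 as (select id,
                   dt,
                   sum(if(flag > 2, 1, 0)) over (partition by id order by dt) flag
            from t2),
     t4 as (select id, flag, datediff(max(dt), min(dt)) days
            from t3
            group by id, flag)
select id, max(days) + 1
from t4
group by id;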
-- join-related settings
-- set hive.auto.convert.join=false;
--
-- set mapred.map.tasks.speculative.execution=true;
-- set mapred.reduce.tasks.speculative.execution=true;

4. Overlapping discount dates problem

The data below is promotion data from an e-commerce platform; the fields are brand, discount start date and discount end date.

brand stt edt
oppo 2021-06-05 2021-06-09
oppo 2021-06-11 2021-06-21
vivo 2021-06-05 2021-06-15
vivo 2021-06-09 2021-06-21
redmi 2021-06-05 2021-06-21
redmi 2021-06-09 2021-06-15
redmi 2021-06-17 2021-06-26
huawei 2021-06-05 2021-06-26
huawei 2021-06-09 2021-06-15
huawei 2021-06-17 2021-06-21

Compute each brand's total number of discount days, counting overlapping dates only once. For example, vivo's first promotion ran from 2021-06-05 to 2021-06-15
and its second from 2021-06-09 to 2021-06-21; the 9th through the 15th overlap and are counted once, so vivo's total is 2021-06-05 through 2021-06-21, i.e. 17 days.

drop table if exists  discount;
create table discount
(
    id string,
    stt string,
    edt string
)
    row format delimited
        fields terminated by '\t';

load data local inpath "/home/hadoop/test/discount.txt" into table discount;

select *
from discount;
-- id 		stt 		edt
-- oppo	2021-06-05	2021-06-09
-- oppo	2021-06-11	2021-06-21
--
-- vivo	2021-06-05	2021-06-15
-- vivo	2021-06-09	2021-06-21
--
-- redmi	2021-06-05	2021-06-21
-- redmi	2021-06-09	2021-06-15
-- redmi	2021-06-17	2021-06-26
--
-- huawei	2021-06-05	2021-06-26
-- huawei	2021-06-09	2021-06-15
-- huawei	2021-06-17	2021-06-21

-- Reference: window-frame syntax (example against a separate `business` table):
-- select name, orderdate, cost,
--        sum(cost) over ()                                                                                              as sample1, -- sum over all rows
--        sum(cost) over (partition by name order by orderdate rows between unbounded preceding and current row)         as sample4, -- aggregate from the partition start to the current row
--        sum(cost) over (partition by name order by orderdate rows between 1 preceding and 1 following)                 as sample6  -- current row plus the rows immediately before and after
-- from business;

-- 1) For each row, bring down the maximum edt of all previous rows within the same brand


select
    id,
    stt,
    edt,
    max(edt) over(partition by id order by stt rows between UNBOUNDED PRECEDING and 1 PRECEDING) maxEdt
from discount;
-- t1

-- redmi	2021-06-05	2021-06-21	null
-- redmi	2021-06-09	2021-06-15	2021-06-21
-- redmi	2021-06-17	2021-06-26	2021-06-21

-- 2) Compare the start date with the max edt brought down: if the start date is later, leave it unchanged;
-- otherwise replace it with that max edt plus one day.
-- For the first row of each brand, maxEdt is null, so no change is needed.
select
    id,
    if(maxEdt is null,stt,if(stt>maxEdt,stt,date_add(maxEdt,1))) stt,
    edt
from t1;
-- t2
-- redmi	2021-06-05	2021-06-21
-- redmi	2021-06-22	2021-06-15
-- redmi	2021-06-22	2021-06-26

-- 3) Subtract the (adjusted) start date from the end date on each row
select
    id,
    datediff(edt,stt) days
from
    t2;
-- t3
-- redmi	16
-- redmi	-7
-- redmi	4


-- 4) Group by brand and sum days + 1 over the rows with a non-negative day count
select
    id,
    sum(if(days>=0,days+1,0)) days
from
    t3
group by id;
-- redmi	22

-- 5) Final HQL
select id,
       sum(if(days >= 0, days + 1, 0)) days
from (select id,
             datediff(edt, stt) days
      from (select id,
                   if(maxEdt is null, stt, if(stt > maxEdt, stt, date_add(maxEdt, 1))) stt,
                   edt
            from (select id,
                         stt,
                         edt,
                         max(edt) over (partition by id order by stt rows between unbounded preceding and 1 preceding) maxEdt
                  from discount) t1) t2) t3
group by id;
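A commonly seen alternative (my own sketch, not the approach used above) is to expand every promotion into one row per day and count distinct days per brand. It is easier to reason about but generates one row per discount day, so it only suits short date ranges. Both approaches should give oppo 16, vivo 17, redmi 22 and huawei 22 on the sample data (worked out by hand):

-- space(n) returns a string of n blanks, so split(space(datediff(edt, stt)), ' ')
-- has datediff + 1 elements, i.e. one position for each day from stt to edt inclusive.
select id,
       count(distinct date_add(stt, pos)) days
from discount
         lateral view posexplode(split(space(datediff(edt, stt)), ' ')) tmp as pos, val
group by id;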

5. Peak concurrent streamers problem

The data below records when each streamer on a live-streaming platform went on and off air. Use it to compute the peak number of streamers online at the same time.

id stt edt
1003 2021-06-14 12:12:12 2021-06-14 18:12:12
1004 2021-06-14 13:12:12 2021-06-14 16:12:12
1002 2021-06-14 15:12:12 2021-06-14 16:12:12
1005 2021-06-14 15:18:12 2021-06-14 20:12:12
1001 2021-06-14 20:12:12 2021-06-14 23:12:12
1006 2021-06-14 21:12:12 2021-06-14 23:15:12
1007 2021-06-14 22:12:12 2021-06-14 23:10:12

drop table if exists  living;
create table living
(
    id string,
    stt string,
    edt string
)
    row format delimited
        fields terminated by '\t';

load data local inpath "/home/hadoop/test/living.txt" into table living;

select *
from living;
-- Think of the records as a stream of +1/-1 events!

-- 1) Turn each record into two events: the start time gets +1 (a streamer comes online) and the end time gets -1 (a streamer goes offline)
select id,stt dt,1 p from living
union
select id,edt dt,-1 p from living;
-- t1
-- 1001	2021-06-14 20:12:12	1
-- 1001	2021-06-14 23:12:12	-1
-- 1002	2021-06-14 15:12:12	1
-- 1002	2021-06-14 16:12:12	-1
-- 1003	2021-06-14 12:12:12	1
-- 1003	2021-06-14 18:12:12	-1
-- 1004	2021-06-14 13:12:12	1
-- 1004	2021-06-14 16:12:12	-1
-- 1005	2021-06-14 15:18:12	1
-- 1005	2021-06-14 20:12:12	-1
-- 1006	2021-06-14 21:12:12	1
-- 1006	2021-06-14 23:15:12	-1
-- 1007	2021-06-14 22:12:12	1
-- 1007	2021-06-14 23:10:12	-1

-- 2) Order all events by time and compute the running total of online streamers
select id,
       dt,
       sum(p) over (order by dt) sum_p
from (select id, stt dt, 1 p
      from living
      union
      select id, edt dt, -1 p
      from living) t1;
-- t2



-- 3) Take the maximum of the running total
select max(sum_p)
from (select id,
             dt,
             sum(p) over (order by dt) sum_p
      from (select id, stt dt, 1 p
            from living
            union
            select id, edt dt, -1 p
            from living) t1) t2;
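Worked out by hand, the running total peaks at 4 between 15:18:12 and 16:12:12, when 1003, 1004, 1002 and 1005 are all live, so the answer on the sample data is 4. An optional sketch (my own addition) that also reports when the peak is reached:

select dt, sum_p
from (select id,
             dt,
             sum(p) over (order by dt) sum_p
      from (select id, stt dt, 1 p
            from living
            union
            select id, edt dt, -1 p
            from living) t1) t2
order by sum_p desc
limit 1;
-- Expected on the sample data: 2021-06-14 15:18:12, 4.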