Hive面试-情景题总结【包含:建表脚本、数据导入脚本、模拟数据】

前言:

Hive面试除了理论知识外
情景题也占据了很大的比重
总结一下有代表性的几道情景题,可以根据这些情景题来推演其他相似的题型
本锦集并无统一标准答案,答案供大家参考,如有更好参考答案,可整理、讨

需要先将模拟数据放入指定路径下这里以 /hivetest 为准
模拟数据:https://pan.baidu.com/s/1DwuJTe3hMR4lz5dx2aKjfQ
密码:uikc

建表语句

#!/usr/bin/env bash

# 定义变量名字
hive_database="hivetest"
hive="hive"

# 创建数据库

init_db_sql="\
create database $hive_database;
"
$hive -e "$init_db_sql"

if [ $? -eq 0 ];then
    echo "============<创建数据库hivetest成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建数据库hivetest失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t2 建表
init_t2_sql="\
use $hive_database;
create table t2(
uid int,
channl int,
min int
)row format delimited fields terminated by ' ';
"
$hive -e "$init_t2_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t2成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t2失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t3 建表

init_t3_sql="\
use $hive_database;
create table t3(
userid string,
month string,
visits int
)row format delimited fields terminated by ',';
"
$hive -e "$init_t3_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t3成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t3失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t4 建表
init_t4_sql="\
use $hive_database;
create table if not exists t4(
uid int,
dt string,
login_status int
)row format delimited fields terminated by ' ';
"
$hive -e "$init_t4_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t4成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t4失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t6 建表
init_t6_sql="\
use $hive_database;
create table if not exists t6(
stu_no int,
class int,
score int
)row format delimited fields terminated by ' ';
"
$hive -e "$init_t6_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t6成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t6失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t8 建表
init_t8_sql="\
use $hive_database;
create table t8( id int, userid string, subject string, score double )row format delimited fields terminated by ' ' ;
"
$hive -e "$init_t8_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t8成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t8失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t9 建表
init_t9_sql="\
use $hive_database;
create table t9(uid int,tags array<int>)row format 
delimited fields terminated by ' ' 
collection items terminated by ',';
"
$hive -e "$init_t9_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t9成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t9失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t101 建表
init_t101_sql="\
use $hive_database;
create table t101(
tags string
);
"
$hive -e "$init_t101_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t101成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t101失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

init_t102_sql="\
use $hive_database;
create table t102(
id int,
lab string
)row format delimited fields terminated by ' ';
"
$hive -e "$init_t102_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t102成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t102失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t11 建表
init_t11_sql="\
use $hive_database;
create table t11( id string, tag string, flag int )row format 
delimited fields terminated by ' ' ;
"
$hive -e "$init_t11_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t11成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t11失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t12 建表
init_t12_sql="\
use $hive_database;
create table t12(
uid string,
name string,
tags string
)
row format delimited
fields terminated by '\t';
"
$hive -e "$init_t12_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t12成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t12失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t13 建表
init_t13_sql="\
use $hive_database;
create table t13(
uid int, 
contents string 
)
row format delimited fields terminated by ' ';
"
$hive -e "$init_t13_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t13成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t13失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t14 建表
init_t14_sql="\
use $hive_database;
create table t14( id int, course string )
row format delimited fields terminated by ',' ;
"
$hive -e "$init_t14_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t14成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t14失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t16 建表
init_t16_sql="\
use $hive_database;
create table t16(dt bigint);
"
$hive -e "$init_t16_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t16成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t16失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t17 建表
init_t17_sql="\
use $hive_database;
create table t17(
sid String,
smh string,
money int
)row format delimited fields terminated by ',';
"
$hive -e "$init_t17_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t17成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t17失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi
	
# t20 建表
init_t20_sql="\
use $hive_database;
create table t20(
log_time String,
uid int
)row format delimited fields terminated by '\t';
"
$hive -e "$init_t20_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t20成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t20失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

# t22 建表		
init_t221_sql="\
use $hive_database;
create table t221(
id int,
name string
)row format delimited fields terminated by ' ';
"
$hive -e "$init_t221_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t221成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t221失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

init_t222_sql="\
use $hive_database;
create table t222(
id int,
name string
)row format delimited fields terminated by ' ';
"
$hive -e "$init_t222_sql"

if [ $? -eq 0 ];then
    echo "============<创建表t222成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<创建表t222失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

load脚本

#!/usr/bin/env bash

init_load_sql="\
use hivetest;
load data local inpath '/hivetest/t2.txt' overwrite into  table t2;
load data local inpath '/hivetest/t3.txt' overwrite into  table t3;
load data local inpath '/hivetest/t4.txt' overwrite into  table t4;
load data local inpath '/hivetest/t6.txt' overwrite into  table t6;
load data local inpath '/hivetest/t8.txt' overwrite into  table t8;
load data local inpath '/hivetest/t9.txt' overwrite into  table t9;
load data local inpath '/hivetest/t101.txt' overwrite into  table t101;
load data local inpath '/hivetest/t101.txt' overwrite into  table t102;
load data local inpath '/hivetest/t11.txt' overwrite into  table t11;
load data local inpath '/hivetest/t12.txt' overwrite into  table t12;
load data local inpath '/hivetest/t13.txt' overwrite into  table t13;
load data local inpath '/hivetest/t14.txt' overwrite into  table t14;
load data local inpath '/hivetest/t16.txt' overwrite into  table t16;
load data local inpath '/hivetest/t17.txt' overwrite into  table t17;
load data local inpath '/hivetest/t20.txt' overwrite into  table t20;
load data local inpath '/hivetest/t221.txt' overwrite into  table t221;
load data local inpath '/hivetest/t222.txt' overwrite into  table t222;
"
hive -e "$init_load_sql"

if [ $? -eq 0 ];then
    echo "============<导入数据成功,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=========="
else
    echo "============<导入数据失败,时间为[`date "+%Y-%m-%d %H:%M:%S"`]~>=============="
    exit 1
fi

常见sql情景题

2、求出每个栏目的被观看次数及累计观看时长

数据: vedio表
Uid channl min
1 1 23
2 1 12
3 1 12
4 1 32
5 1 342
6 2 13
7 2 34
8 2 13
9 2 134

参考答案

此题考察的是聚合函数的使用

首先根据题目来判断要以 channl 字段作为分类对象(得出结论 group by channl)

然后根据题目得知需要求出观看次数count(1) 观看时长 sum(min)

select channl,count(1),sum(min)
from
t2
group by channl

3、编写sql实现每个用户截止到每月为止的最大单月访问次数和累计到该月的总访问次数

数据: userid,month,visits
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5
A,2015-03,16
A,2015-03,22
B,2015-03,23
B,2015-03,10
B,2015-03,1

实现效果
在这里插入图片描述
参考答案

这样的sql需要由内往外分析

1)先由实现效果和题目得出基础分类为用户userid 和 月份month(得出结论 group by userid,month),顺便查询出总每个用户每月的访问次数 sum(visits)

2)在第一步的基础上利用开窗函数查询最大访问次数和总访问次数即可

select 
userid `用户`,month `月份`, -- 2)在第一步的基础上利用开窗函数查询最大访问次数和总访问次数即可
max(a) over(partition by userid order by month) `最大访问次数`,
sum(a) over(partition by userid order by month) `总访问次数`,
a `当月访问次数` 
from
(
select  --  1)先由实现效果和题目得出基础分类为用户(userid)和月份(month)
userid,month,sum(visits) a
from 
t3
group by userid,month
) m;

4、编写连续7天登录的总人数:

数据: t4表
Uid dt login_status(1登录成功,0异常)
1 2019-07-11 1
1 2019-07-12 1
1 2019-07-13 1
1 2019-07-14 1
1 2019-07-15 1
1 2019-07-16 1
1 2019-07-17 1
1 2019-07-18 1
2 2019-07-11 1
2 2019-07-12 1
2 2019-07-13 0
2 2019-07-14 1
2 2019-07-15 1
2 2019-07-16 0
2 2019-07-17 1
2 2019-07-18 0
3 2019-07-11 1
3 2019-07-12 1
3 2019-07-13 1
3 2019-07-14 1
3 2019-07-15 1
3 2019-07-16 1
3 2019-07-17 1
3 2019-07-18 1

参考答案

select uid,c
from
(
select uid,ndt,count(1) as c
from
(
select uid,date_sub(dt,cast(rn as int)) as ndt
from
(
select uid,dt,
row_number() over(distribute by uid sort by dt) as rn
from
(
select uid,dt
from t4
where login_status=1
) t41
) t42
) t43
group by uid,ndt
) t44
group by uid,c
having c>=7
;

6、编写sql语句实现每班前三名,分数一样并列,同时求出前三名按名次排序的一次的分差:
数据如下:

stu表
Stu_no class score
1 1901 90
2 1901 90
3 1901 83
4 1901 60
5 1902 66
6 1902 23
7 1902 99
8 1902 67
9 1902 87

编写sql实现,得到结果如下

结果数据: 班级
stu_no score rn rn1 rn_diff
1901 1 90 1 1 90
1901 2 90 2 1 0
1901 3 83 3 1 -7
1902 7 99 1 1 99
1902 9 87 2 2 -12
1902 8 67 3 3 -20

参考答案

select class,stu_no,score,rn,rn_diff
from
(
select *,row_number() over(partition by class order by score desc) rn,
lag(score,1,score) over(partition by class order by score desc) lg,
score-lag(score,1,0)over(distribute by class sort by score desc) rn_diff
from
stu
) a
where rn<4;

8、编写sql实现行列互换:

数据如下
在这里插入图片描述
编写sql实现,得到结果如下
在这里插入图片描述
参考答案

select 
case when userid is null then "total" else userid end as userid,
sum(case when subject='yuwen' then score else 0 end) as `语文`,
sum(case when subject='shuxue' then score else 0 end) as `数学`,
sum(case when subject='yingyu' then score else 0 end) as `英语`,
sum(case when subject='zhengzi' then score else 0 end) as `政治`,
sum(score) total
from t8
group by userid with rollup
order by userid;

9、编写sql实现如下:

数据: t1表
uid tags
1 1,2,3
2 2,3
3 1,2

编写sql实现如下结果:

uid tag
1 1
1 2
1 3
2 2
2 3
3 1
3 2

参考答案

select uid from tt1 lateral view explode(tags) tt as tag;

10、行转列

数据:

T1表:
Tags
1,2,3
1,2
2,3

T2表:
Id lab
1 A
2 B
3 C

根据T1和T2表的数据,编写sql实现如下结果:

ids tags
1,2,3 A,B,C
1,2 A,B
2,3 B,C

参考答案

select tags,concat_ws(",",collect_list(lab))
from (
select tags,lab
from t102 
join
(select tags,tag
from
t101 lateral view explode(split(tags,",")) tt1 as tag
) abc
on t102.id=abc.tag) bb
group by tags
order by tags desc;  

11、行转列

数据:

t1表:
id tag flag
a b 2
a b 1
a b 3
c d 6
c d 8
c d 8

编写sql实现如下结果:

id tag flag
a b 1|2|3
c d 6|8

参考答案

select id,tag,concat_ws('|',collect_set(cast(flag as string))) as flag
from
(
select id,tag,flag from t11 group by id,tag,flag
order by flag
) aa
group by id,tag;

12、列转行

数据:

t1表
uid name tags
1 goudan chihuo,huaci
2 mazi sleep
3 laotie paly

编写sql实现如下结果:

uid name tag
1 goudan chihuo
1 goudan huaci
2 mazi sleep
3 laotie paly

参考答案

select
uid,
name,
tag
from t12
lateral view explode(split(tags,",")) t1 as tag;

13、行转列
数据:

t1表:
uid contents
1 i|love|china
2 china|is|good|i|i|like

统计结果如下,如果出现次数一样,则按照content名称排序:

content cnt
i 3
china 2
good 1
like 1
love 1
is 1

14、列转行
数据:

t1表
id course
1,a
1,b
1,c
1,e
2,a
2,c
2,d
2,f
3,a
3,b
3,c
3,e

根据编写sql,得到结果如下(表中的1表示选修,表中的0表示未选修):

id a b c d e f
1 1 1 1 0 1 0
2 1 0 1 1 0 1
3 1 1 1 0 1 0

参考答案

select id,
sum(case when course='a' then 1 else 0 end) as `a`,
sum(case when course='b' then 1 else 0 end) as `b`,
sum(case when course='c' then 1 else 0 end) as `c`,
sum(case when course='d' then 1 else 0 end) as `d`,
sum(case when course='e' then 1 else 0 end) as `e`,
sum(case when course='f' then 1 else 0 end) as `f`
from t14
group by id;

16、时间格式转换:yyyyMMdd -> yyyy-MM-dd
数据:

t1表
20190730
20190731

编写sql实现如下的结果:

2019-07-30
2019-07-31

参考答案

select
from_unixtime(unix_timestamp(cast(dt as string),'yyyyMMdd'),"yyyy-MM-dd")
from t16;

17、编写Hive的HQL语句求出每个店铺的当月销售额和累计到当月的总销售额?
数据:

店铺,月份,金额
a,01,150
a,01,200
b,01,1000
b,01,800
c,01,250
c,01,220
b,01,6000
a,02,2000
a,02,3000
b,02,1000
b,02,1500
c,02,350
c,02,280
a,03,350
a,03,250

参考答案

select 
case when sid is null then "total" else sid end
,sum(money) from t17
group by sid with rollup;	

20、以下为一个用户连续登录的记录表,当月每次登录都会产生一条记录。计算出每个用户本月最大连续登录天枢天数

2018-10-01 123
2018-10-02 123
2018-10-02 456
2018-10-04 123
2018-10-04 456
2018-10-05 123
2018-10-06 123

参考答案

select uid,max(ct) from
(
select uid,count(ds) ct from
(
select uid,date_sub(log_time,rm) ds from(
select uid,log_time,row_number() over(partition by uid order by log_time) rm
from t20) a
) dd
group by uid,ds
) mx
group by uid;

22、使用hive求出两个数据集的差集?
数据

t1表:
id name
1 zs
2 ls

t2表:
id name
1 zs
3 ww

结果如下:

id name
2 ls
3 ww

参考答案

select t1.id as id, t1.name as name 
from t221 t1 
left join t222 t2 on t1.id=t2.id 
where t2.id is null union 
select t2.id as id, t2.name as name 
from t221 t1 
right join t222 t2 on t1.id=t2.id 
where t1.id is null ;

有收获?希望烙铁们来个三连击,让更多的同学看到这篇文章

1、烙铁们,关注我看完保证有所收获,不信你打我。

2、点个赞呗,可以让更多的人看到这篇文章,后续还会有很哇塞的产出。

本文章仅供学习及个人复习使用,如需转载请标明转载出处,如有错漏欢迎指出
务必注明来源(注明: 来源:csdn , 作者:-马什么梅-)

  • 9
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 12
    评论
评论 12
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

一马什么梅一

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值