1、SQL面试题
1.1、连续7天登录的用户
-- 数据。uid dt status(1 正常登录,0 异常)
1 2019-07-11 1
1 2019-07-12 1
1 2019-07-13 1
1 2019-07-14 1
1 2019-07-15 1
1 2019-07-16 1
1 2019-07-17 1
1 2019-07-18 1
2 2019-07-11 1
2 2019-07-12 1
2 2019-07-13 0
2 2019-07-14 1
2 2019-07-15 1
2 2019-07-16 0
2 2019-07-17 1
2 2019-07-18 0
3 2019-07-11 1
3 2019-07-12 1
3 2019-07-13 1
3 2019-07-14 0
3 2019-07-15 1
3 2019-07-16 1
3 2019-07-17 1
3 2019-07-18 1
-- DDL: login-event table (uid, login date, status: 1 = normal login, 0 = abnormal)
CREATE TABLE ulogin (
    uid    INT,
    dt     DATE,
    status INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';
-- Load the sample data from the local filesystem
LOAD DATA LOCAL INPATH '/home/hadoop/data/ulogin.dat' INTO TABLE ulogin;
-- 连续值的求解,面试中常见的问题。这也是同一类,基本都可按照以下思路进行
-- 1、使用 row_number 在组内给数据编号(rownum)
-- 2、某个值 - rownum = gid,得到结果可以作为后面分组计算的依据
-- 3、根据求得的gid,作为分组条件,求最终结果
-- Step 1: number each user's successful-login days in date order
SELECT
    uid,
    dt,
    ROW_NUMBER() OVER (PARTITION BY uid ORDER BY dt) AS rownum
FROM ulogin
WHERE status = 1;
-- Step 2: date minus row number is constant within a consecutive run,
-- so it can serve as a group id (gid) for each streak
SELECT
    uid,
    DATE_SUB(dt, ROW_NUMBER() OVER (PARTITION BY uid ORDER BY dt)) AS gid
FROM ulogin
WHERE status = 1;
-- Step 3: group by (uid, gid); a group with >= 7 rows is a 7-day login streak
SELECT
    uid,
    COUNT(*) AS countlogin
FROM (
    SELECT
        uid,
        DATE_SUB(dt, ROW_NUMBER() OVER (PARTITION BY uid ORDER BY dt)) AS gid
    FROM ulogin
    WHERE status = 1
) tmp
GROUP BY uid, gid
HAVING COUNT(*) >= 7;
1.2、编写sql语句实现每班前三名,分数一样并列,同时求出前三名按名次排序的分差
-- 数据。sid class score
1 1901 90
2 1901 90
3 1901 83
4 1901 60
5 1902 66
6 1902 23
7 1902 99
8 1902 67
9 1902 87
-- 待求结果数据如下:
class score rank lagscore
1901 90 1 0
1901 90 1 0
1901 83 2 -7
1901 60 3 -23
1902 99 1 0
1902 87 2 -12
1902 67 3 -20
-- DDL: student scores (student no, class, score)
CREATE TABLE stu (
    sno   INT,
    class STRING,
    score INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';
-- Load the sample data
LOAD DATA LOCAL INPATH '/home/hadoop/data/stu.dat' INTO TABLE stu;
-- 求解思路:
-- 1、上排名函数,分数一样并列,所以用dense_rank
-- 2、将上一行数据下移,相减即得到分数差
-- 3、处理 NULL
-- Step 1: dense_rank so tied scores share a rank without gaps.
-- FIX: the original wrote "select sno class,score" (missing comma), which
-- aliased sno AS class; the target result set has columns class/score/rank.
SELECT
    class,
    score,
    DENSE_RANK() OVER (PARTITION BY class ORDER BY score DESC) AS rank
FROM stu;
-- Step 2: lag() pulls the previous (higher) score down one row; the
-- difference is the gap to the next-better rank (NULL on each class leader).
-- FIX: "select sno class" (missing comma) aliased sno AS class; the target
-- output's class column must come from the class field itself.
SELECT
    class,
    score,
    DENSE_RANK() OVER (PARTITION BY class ORDER BY score DESC) AS rank,
    score - LAG(score) OVER (PARTITION BY class ORDER BY score DESC) AS lagscore
FROM stu;
-- Step 3: nvl() turns the class leader's NULL gap into 0.
-- FIX 1: "select sno class" (missing comma) aliased sno AS class.
-- FIX 2: the task asks for the TOP 3 per class, and the expected output only
-- contains ranks 1-3, but the original never filtered on rank; wrap in a
-- subquery and keep rank <= 3 (window results can't be filtered in WHERE).
SELECT class, score, rank, lagscore
FROM (
    SELECT
        class,
        score,
        DENSE_RANK() OVER (PARTITION BY class ORDER BY score DESC) AS rank,
        NVL(score - LAG(score) OVER (PARTITION BY class ORDER BY score DESC), 0) AS lagscore
    FROM stu
) t
WHERE rank <= 3;
1.3、行 <=> 列
行转列
-- 数据。id1 id2 flag
a b 2
a b 1
a b 3
c d 6
c d 8
c d 8
-- 编写sql实现如下结果
id1 id2 flag
a b 2|1|3
c d 6|8
-- 创建表 & 加载数据
-- DDL: source rows for the rows-to-column exercise (id, tag, flag)
CREATE TABLE row2line (
    id   STRING,
    tag  STRING,
    flag INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';
LOAD DATA LOCAL INPATH '/root/data/data2.dat' INTO TABLE row2line;
-- 解题思路
-- 1、分组将数据聚拢,函数有collect_set、collect_list、sort_array
-- Explore the collection aggregates: collect_set (dedup), collect_list
-- (keeps duplicates and input order), sort_array (sorts the collected array)
SELECT id, tag, collect_set(flag) AS flag
FROM row2line GROUP BY id, tag;
SELECT id, tag, collect_list(flag) AS flag
FROM row2line GROUP BY id, tag;
SELECT id, tag, sort_array(collect_set(flag)) AS flag
FROM row2line GROUP BY id, tag;
-- Step 2: join the collected values with '|'.
-- FIX: use collect_set, not collect_list — the target output for (c, d) is
-- the de-duplicated "6|8", whereas collect_list would produce "6|8|8".
-- The cast to string is required: concat_ws only accepts string/array<string>.
-- NOTE(review): collect_set does not guarantee element order; if the exact
-- order "2|1|3" matters, de-duplicate while preserving input order instead.
select id, tag, concat_ws('|', collect_set(cast(flag as string))) as flag
from row2line
group by id, tag;
-- 这里报错,CONCAT_WS must be "string or array<string>"。加一个类型转换即可
列转行
-- 数据:id course
1 java
1 hadoop
1 hive
1 hbase
2 java
2 hive
2 spark
2 flink
3 java
3 hadoop
3 hive
3 kafka
-- 编写sql,得到结果如下(1表示选修,0表示未选修)
id java hadoop hive hbase spark flink kafka
1 1 1 1 1 0 0 0
2 1 0 1 0 1 1 0
3 1 1 1 0 0 0 1
-- 建表加载数据
-- DDL: course-selection rows for the column-to-rows exercise (id, course)
CREATE TABLE line2row (
    id     STRING,
    course STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';
LOAD DATA LOCAL INPATH '/root/data/data1.dat' INTO TABLE line2row;
-- Pivot with sum(case when ...): one 0/1 indicator column per course,
-- grouped by student id
SELECT
    id,
    sum(CASE WHEN course = "java"   THEN 1 ELSE 0 END) AS java,
    sum(CASE WHEN course = "hadoop" THEN 1 ELSE 0 END) AS hadoop,
    sum(CASE WHEN course = "hive"   THEN 1 ELSE 0 END) AS hive,
    sum(CASE WHEN course = "hbase"  THEN 1 ELSE 0 END) AS hbase,
    sum(CASE WHEN course = "spark"  THEN 1 ELSE 0 END) AS spark,
    sum(CASE WHEN course = "flink"  THEN 1 ELSE 0 END) AS flink,
    sum(CASE WHEN course = "kafka"  THEN 1 ELSE 0 END) AS kafka
FROM line2row
GROUP BY id;
小结:3类典型问题,行列互转、TopN+行函数、连续值求解(row_number、gid、分组)
2、自定义函数
当 Hive 提供的内置函数无法满足实际的业务处理需要时,可以考虑使用用户自定义函数进行扩展。用户自定义函数分为以下三类:
- UDF(User Defined Function)。用户自定义函数,一进一出
- UDAF(User Defined Aggregation Function)。用户自定义聚集函数,多进一出;类似于:count/max/min
- UDTF(User Defined Table-Generating Functions)。用户自定义表生成函数,一进多出;类似于:explode
UDF开发:
- 继承org.apache.hadoop.hive.ql.exec.UDF
- 需要实现evaluate函数;evaluate函数支持重载
- UDF必须要有返回类型,可以返回null,但是返回类型不能为void
UDF开发步骤
- 创建maven java 工程,添加依赖
- 开发java类继承UDF,实现evaluate 方法
- 将项目打包上传服务器
- 添加开发的jar包
- 设置函数与自定义函数关联
- 使用自定义函数
需求:扩展系统 nvl 函数功能:
-- 系统内建的 nvl 函数
nvl(ename, "OK"): ename==null => 返回第二个参数
-- 要实现的函数功能
nvl(ename, "OK"): ename==null or ename=="" or ename==" " => 返回第二个参数
2.1、创建maven java 工程,添加依赖
<!-- pom.xml 文件 -->
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.3.7</version>
</dependency>
</dependencies>
2.2、开发java类继承UDF,实现evaluate 方法
package cn.lagou.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * Extended NVL: returns the fallback value {@code x} when {@code t} is NULL,
 * empty, or whitespace-only; otherwise returns {@code t} unchanged.
 */
public class nvl extends UDF {
    public Text evaluate(final Text t, final Text x) {
        // Keep t only when it is non-null and non-blank after trimming;
        // every other case falls back to x.
        if (t != null && !t.toString().trim().isEmpty()) {
            return t;
        }
        return x;
    }
}
2.3、将项目打包上传服务器
2.4、添加开发的jar包(在Hive命令行中)
add jar /home/hadoop/hiveudf.jar;
2.5、创建临时函数。指定类名一定要完整的路径,即包名加类名
create temporary function mynvl as "cn.lagou.hive.udf.nvl";
2.6、执行查询
-- 测试基本功能
select mynvl(comm, 0) from mydb.emp;
-- 测试扩充的功能
select mynvl("", "OK");
select mynvl(" ", "OK");
2.7、退出Hive命令行,再进入Hive命令行。执行步骤6的测试,发现函数失效。
备注:创建临时函数每次进入Hive命令行时,都必须执行以下语句,很不方便:
add jar /home/hadoop/hiveudf.jar;
create temporary function mynvl as "cn.lagou.hive.udf.nvl";
2.8、创建永久函数:
(1)将jar上传HDFS
hdfs dfs -put hiveudf.jar jar/
(2)在Hive命令行中创建永久函数
create function mynvl1 as 'cn.lagou.hive.udf.nvl' using jar 'hdfs:/user/hadoop/jar/hiveudf.jar';
-- 查询所有的函数,发现 mynvl1 在列表中
show functions;
(3)退出Hive,再进入,执行测试
-- Verify the basic functionality after restarting the Hive CLI.
-- FIX: call the PERMANENT function mynvl1 — the temporary mynvl no longer
-- exists after the CLI session ends, so the original statements would fail.
select mynvl1(comm, 0) from mydb.emp;
-- Verify the extended empty/blank-string handling
select mynvl1("", "OK");
select mynvl1(" ", "OK");
(4)删除永久函数,并检查
drop function mynvl1;
show functions;