SparkCore 试题
1. 连续登录天数 及日期
* cs,2020-09-20
* cs,2020-09-20
* cs,2020-09-21
* cs,2020-09-22
* cs,2020-09-24
* cs,2020-09-25
* go,2020-09-20
* go,2020-09-22
* go,2020-09-24
* go,2020-09-25
// Compute each user's consecutive-login runs: (name, run length, first day, last day).
// Fixes vs. original: the function was missing its closing brace, `val list`
// was computed but unused, and head/last were taken from a groupByKey iterable
// whose order is not guaranteed — the values are now sorted first.
def process(sc: SparkContext): Unit = {
  // Input lines look like "cs,2020-09-20" (name,date).
  val rdd = sc.textFile("data/input/logging")
  val groupRDD = rdd.map(x => {
    val splits = x.split(",")
    (splits(0), splits(1))
  }).groupByKey()
  // Dedup + sort each user's dates, then subtract the index from each date:
  // every date in one consecutive run collapses to the same "base date".
  val personDateRdd = groupRDD.flatMapValues(dates => {
    val sorted = dates.toSet.toList.sorted
    val calendar = Calendar.getInstance()
    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    sorted.zipWithIndex.map { case (dateStr, index) =>
      val time = sdf.parse(dateStr)
      calendar.setTime(time)
      calendar.add(Calendar.DATE, -index)
      (dateStr, sdf.format(calendar.getTime))
    }
  })
  // Group by (name, baseDate): each group is one consecutive run.
  personDateRdd.map(x => {
    ((x._1, x._2._2), x._2._1)
  }).groupByKey().mapValues(run => {
    // groupByKey gives no ordering guarantee, so sort before head/last.
    val sortedRun = run.toList.sorted
    (sortedRun.size, sortedRun.head, sortedRun.last)
  }).map(x => {
    (x._1._1, x._2._1, x._2._2, x._2._3)
  }).foreach(println)
}
2. TOPN sql 1
-- Hive table for the TOP-N exercise; loaded from a comma-delimited text file.
CREATE TABLE people (
    id   INT,
    sex  STRING,
    age  INT,
    name STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
mysql> desc people;
+-------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+-------+--------------+------+-----+---------+----------------+
| id | int(4) | NO | PRI | NULL | auto_increment |
| sex | varchar(14) | YES | | NULL | |
| age | decimal(4,0) | YES | | NULL | |
| name | varchar(14) | YES | | NULL | |
+-------+--------------+------+-----+---------+----------------+
1,female,18,zhangsan
2,male,18,lisi
3,male,20,wangwu
4,female,18,zhaoliu
5,female,18,sunqi
6,female,19,zhouba
7,female,38,wujiu
8,male,22,zhengshi
9,male,23,tianyu
10,male,28,zhangbailu
11,male,18,fengnan
按照性别进行分组,求分组后最大的两个年龄的人
-- Top-2 oldest people per sex.
-- FIX: ORDER BY age must be DESC — the original ascending order ranked the
-- YOUNGEST first, returning the two smallest ages instead of the largest.
-- Also renamed the alias: `rank` is a reserved function name in Hive.
SELECT *
FROM (
    SELECT
        id, name, age, sex,
        ROW_NUMBER() OVER (PARTITION BY sex ORDER BY age DESC) AS rnk
    FROM people
) a
WHERE a.rnk < 3;
3. TOPN sql 2
-- Students with a '-'-separated skill list; loaded from a comma-delimited file.
CREATE TABLE stu_skills (
    id     INT,
    name   STRING,
    age    INT,
    skills STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
1,pk001,35,Spark-CDH-Hive-Hadoop
2,pk002,37,Flink-CDH-Hive-Hadoop
3,pk003,32,CDH-Hive-Hadoop
4,pk004,21,Spark-Hive-Hadoop
5,pk005,25,Hadoop-HBase-Hive
6,pk006,28,Spark-Hive-Hadoop
按skill分组,求年龄最大
-- Explode the '-'-separated skill list, then keep the two oldest students
-- per individual skill.
WITH exploded AS (
    SELECT
        id, name, age,
        explode(split(skills, '-')) AS skill
    FROM stu_skills
),
ranked AS (
    SELECT
        e.*,
        ROW_NUMBER() OVER (PARTITION BY e.skill ORDER BY e.age DESC) AS r
    FROM exploded e
)
SELECT * FROM ranked WHERE r <= 2;
4. 行列转换
1,spark
1,flink
1,cdh
1,hive
2,spark
2,cdh
2,kafka
2,flume
3,spark
3,flink
3,cdh
3,hive
-- One row per (student id, course) pair; loaded from a comma-delimited file.
CREATE TABLE stu_course (
    id     INT,
    course STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
使用case when
-- Rows-to-columns pivot: one row per student, one 0/1 flag per course.
-- Conditional aggregation folds the original two-level query into one:
-- MAX over the CASE flags per id is identical to the subquery + MAX form.
SELECT
    id,
    MAX(CASE WHEN course = 'spark' THEN 1 ELSE 0 END) AS spark,
    MAX(CASE WHEN course = 'flink' THEN 1 ELSE 0 END) AS flink,
    MAX(CASE WHEN course = 'cdh'   THEN 1 ELSE 0 END) AS cdh,
    MAX(CASE WHEN course = 'hive'  THEN 1 ELSE 0 END) AS hive,
    MAX(CASE WHEN course = 'kafka' THEN 1 ELSE 0 END) AS kafka,
    MAX(CASE WHEN course = 'flume' THEN 1 ELSE 0 END) AS flume
FROM stu_course
GROUP BY id;
===>
id spark flink cdh hive kafka flume
1 1 1 1 1 0 0
3 1 1 1 1 0 0
2 1 0 1 0 1 1
高级版
-- Global distinct course list as a single array.
SELECT collect_set(course) AS course FROM stu_course;
course
["cdh","hive","flink","spark","flume","kafka"]
-- Per-student distinct course list as an array.
SELECT id, collect_set(course) AS course FROM stu_course GROUP BY id;
id course
1 ["cdh","hive","flink","spark"]
3 ["cdh","hive","flink","spark"]
2 ["cdh","spark","flume","kafka"]
-- Pair every student's course array with the single global course array.
-- FIX: the original bare JOIN had no ON clause — an implicit cross join
-- that strict-mode Hive rejects; CROSS JOIN states the intent explicitly.
CREATE TABLE tmp1 AS
SELECT
    b.id, b.course, a.courses
FROM
    (SELECT collect_set(course) AS courses FROM stu_course) a
CROSS JOIN
    (SELECT id, collect_set(course) AS course FROM stu_course GROUP BY id) b;
id course courses
1 ["cdh","hive","flink","spark"] ["cdh","hive","flink","spark","flume","kafka"]
3 ["cdh","hive","flink","spark"] ["cdh","hive","flink","spark","flume","kafka"]
2 ["cdh","spark","flume","kafka"] ["cdh","hive","flink","spark","flume","kafka"]
匹配
-- Pivot via array membership: flag whether the student's course set contains
-- the i-th element of the global course set.
-- NOTE(review): the column aliases (cdh, hive, flink, ...) assume collect_set
-- produced the global array in exactly the order shown in the sample output
-- above. collect_set gives NO ordering guarantee, so the alias names may not
-- match courses[i] on a rerun — confirm, or build tmp1 with
-- sort_array(collect_set(...)) to pin the order.
select
id,
case when array_contains(course, courses[0]) then 1 else 0 end as cdh,
case when array_contains(course, courses[1]) then 1 else 0 end as hive,
case when array_contains(course, courses[2]) then 1 else 0 end as flink,
case when array_contains(course, courses[3]) then 1 else 0 end as spark,
case when array_contains(course, courses[4]) then 1 else 0 end as flume,
case when array_contains(course, courses[5]) then 1 else 0 end as kafka
from tmp1;
5. 连续出现三次
1,1
2,1
3,1
4,2
5,1
6,2
7,2
8,3
9,3
10,3
11,3
12,2
13,4
13,5
-- (id, number) pairs used for the "three consecutive values" exercise.
CREATE TABLE series (
    id     INT,
    number INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
最土的方法
-- Brute force: join three adjacent ids and require all three numbers equal.
SELECT DISTINCT s1.number
FROM series s1
JOIN series s2
  ON s2.id = s1.id + 1
JOIN series s3
  ON s3.id = s2.id + 1
WHERE s1.number = s2.number
  AND s2.number = s3.number;
窗口
-- Window version: within one number's partition, if the id two rows back is
-- exactly id - 2, this number occupied three consecutive ids.
SELECT DISTINCT t.number
FROM (
    SELECT
        id,
        number,
        LAG(id, 2) OVER (PARTITION BY number ORDER BY id) AS prev_id
    FROM series
) t
WHERE t.id = t.prev_id + 2;
没有 id 的数据
1
1
1
2
1
2
2
3
3
3
3
2
4
5
-- Same exercise but the data has no id column — only the bare values.
CREATE TABLE consecutive (
    number INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- Attach a positional row number to the id-less data.
-- FIX: the original queried the wrong table (`series`, which has an id) and
-- used ORDER BY number, which destroys the file order the trick depends on;
-- ORDER BY null keeps the rows as read (matching the query below).
select number, row_number() over (order by null) r from consecutive;
-- Same lag trick as above; the synthetic row number r stands in for the id.
WITH numbered AS (
    SELECT
        number,
        ROW_NUMBER() OVER (ORDER BY null) AS r
    FROM consecutive
),
lagged AS (
    SELECT
        r,
        number,
        LAG(r, 2) OVER (PARTITION BY number ORDER BY r) AS prev_r
    FROM numbered
)
SELECT DISTINCT number
FROM lagged
WHERE r = prev_r + 2;
6. 每年总成绩都有提升的学生
-- Per-year, per-course scores for each student.
CREATE TABLE score (
    year    INT,
    course  STRING,
    student STRING,
    score   INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
2020,chinese,A,83
2020,math,A,58
2020,english,A,29
2020,chinese,B,83
2020,math,B,77
2020,english,B,67
2021,chinese,A,50
2021,math,A,93
2021,english,A,70
2021,chinese,B,86
2021,math,B,43
2021,english,B,39
2022,chinese,A,90
2022,math,A,49
2022,english,A,70
2022,chinese,B,86
2022,math,B,83
2022,english,B,97
问题1:
每年每门学科排名第一的学生
group by year,course
-- Top scorer per (year, course); ties return every tied student.
-- FIX: the original bare JOIN carried its join predicates in WHERE — an
-- implicit cross join; the conditions belong in the ON clause.
SELECT
    a.year, a.course, b.student
FROM (
    SELECT year, course, MAX(score) AS max_score
    FROM score
    GROUP BY year, course
) a
JOIN score b
  ON a.year = b.year
 AND a.course = b.course
 AND a.max_score = b.score;
窗口函数实现
-- Window variant: compare every row against its (year, course) group maximum.
SELECT
    t.year, t.course, t.student
FROM (
    SELECT
        year, course, student, score,
        MAX(score) OVER (PARTITION BY year, course) AS top_score
    FROM score
) t
WHERE t.score = t.top_score;
第二种方式
-- first_value variant for the per-(year, course) top scorer.
-- FIX: the window must ORDER BY score DESC — ascending order made
-- first_value() return the LOWEST scorer, not the first-ranked one.
-- (Unlike the join/max variants, ties yield one arbitrary winner here.)
SELECT DISTINCT
    year, course,
    first_value(student) OVER (PARTITION BY year, course ORDER BY score DESC) AS student
FROM score
ORDER BY year;
问题2:
每年总分都有上涨的
-- Students whose yearly total strictly increased every year.
-- FIX: the rise test must be > 0 (any increase counts) — the original > 1
-- wrongly excluded a student whose total rose by exactly one point.
-- Also added the missing terminating semicolon.
SELECT
    b.student
FROM (
    SELECT
        a.year,
        a.student,
        a.sum_score,
        -- 1 when this year's total beats last year's; the first year is
        -- compared against the LAG default of 0, so it always passes.
        IF(a.sum_score - LAG(a.sum_score, 1, 0) OVER (PARTITION BY a.student ORDER BY a.year) > 0, 1, 0) AS rose
    FROM (
        SELECT year, student, SUM(score) AS sum_score
        FROM score
        GROUP BY year, student
    ) a
) b
GROUP BY b.student
-- every year rose <=> the minimum flag is 1 (equivalent to avg(flag) = 1)
HAVING MIN(b.rose) = 1;
7. 统计每个用户最大连续登陆天数
pk,20210801
pk,20210802
pk,20210803
pk,20210804
pk,20210806
pk,20210807
pk,20210808
pk,20210811
pk,20210812
ruoze,20210730
ruoze,20210731
ruoze,20210801
ruoze,20210804
ruoze,20210806
-- Login events: user name plus a yyyyMMdd date string.
CREATE TABLE login (
    name STRING,
    day  STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
求年度最大连续登录天数
如何获得某日期是一年中的第几天
-- Longest consecutive-login streak per user.
-- dayofyear(day) - row_number() is constant across one unbroken run of
-- days, so grouping on that difference isolates each streak.
-- NOTE(review): dayofyear resets at a year boundary, so this only holds
-- within a single calendar year — which matches the stated 年度 scope.
SELECT
    runs.name,
    MAX(runs.streak_len)
FROM (
    SELECT
        marked.name,
        COUNT(1) AS streak_len
    FROM (
        SELECT
            name,
            day,
            dayofyear(from_unixtime(unix_timestamp(day, 'yyyyMMdd'), 'yyyy-MM-dd'))
                - ROW_NUMBER() OVER (PARTITION BY name ORDER BY day) AS diff
        FROM login
    ) marked
    GROUP BY marked.name, marked.diff
) runs
GROUP BY runs.name;