SparkCore Interview Questions

1. Consecutive login days and their start/end dates

 * cs,2020-09-20
 * cs,2020-09-20
 * cs,2020-09-21
 * cs,2020-09-22
 * cs,2020-09-24
 * cs,2020-09-25
 * go,2020-09-20
 * go,2020-09-22
 * go,2020-09-24
 * go,2020-09-25
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.SparkContext

def process(sc: SparkContext): Unit = {
    val rdd = sc.textFile("data/input/logging")

    // (user, date) pairs, grouped by user
    val groupRDD = rdd.map(x => {
      val splits = x.split(",")
      (splits(0), splits(1))
    }).groupByKey()


    val personDateRdd = groupRDD.flatMapValues(line => {
      // dedupe same-day logins, then sort chronologically
      val sorted = line.toSet.toList.sorted

      val calendar = Calendar.getInstance()
      val sdf = new SimpleDateFormat("yyyy-MM-dd")
      var index = 0

      // subtracting the running index maps every date in one consecutive
      // run to the same anchor date
      sorted.map(dateStr => {
        val time = sdf.parse(dateStr)
        calendar.setTime(time)
        calendar.add(Calendar.DATE, -index)
        index += 1
        (dateStr, sdf.format(calendar.getTime))
      })
    })

    personDateRdd.map(x => {
      // key: (user, anchor date); value: the real login date
      ((x._1, x._2._2), x._2._1)
    }).groupByKey().mapValues(x => {
      val list = x.toList.sorted
      (list.size, list.head, list.last) // streak length, first day, last day
    }).map(x => {
      (x._1._1, x._2._1, x._2._2, x._2._3)
    }).foreach(println)
  }
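
The core of this solution is the anchor-date trick: after sorting a user's distinct login dates, subtracting the running index i from the i-th date maps every day of one consecutive run onto the same anchor date. A minimal standalone sketch of just that step (plain Scala with java.time, using the cs sample above):

import java.time.LocalDate

val dates = List("2020-09-20", "2020-09-21", "2020-09-22", "2020-09-24", "2020-09-25")
  .map(LocalDate.parse)

// i-th sorted date minus i days == the same anchor date within one streak
val anchored = dates.zipWithIndex.map { case (d, i) => (d.minusDays(i), d) }

anchored.groupBy(_._1).values.foreach { run =>
  println(s"streak of ${run.size} day(s): ${run.head._2} -> ${run.last._2}")
}
// prints (order may vary):
//   streak of 3 day(s): 2020-09-20 -> 2020-09-22
//   streak of 2 day(s): 2020-09-24 -> 2020-09-25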

2. TOPN sql 1

create table people(
id int,
sex string,
age int,
name string
) 
row format delimited fields terminated by ',';
mysql> desc people;
+-------+--------------+------+-----+---------+----------------+
| Field | Type         | Null | Key | Default | Extra          |
+-------+--------------+------+-----+---------+----------------+
| id    | int(4)       | NO   | PRI | NULL    | auto_increment |
| sex   | varchar(14)  | YES  |     | NULL    |                |
| age   | decimal(4,0) | YES  |     | NULL    |                |
| name  | varchar(14)  | YES  |     | NULL    |                |
+-------+--------------+------+-----+---------+----------------+
1,female,18,zhangsan
2,male,18,lisi
3,male,20,wangwu
4,female,18,zhaoliu
5,female,18,sunqi
6,female,19,zhouba
7,female,38,wujiu
8,male,22,zhengshi
9,male,23,tianyu
10,male,28,zhangbailu
11,male,18,fengnan
Group by sex and return the two oldest people in each group:

select * from
(
select
id, name, age, sex,
row_number() over (partition by sex order by age desc) rnk
from people) a where a.rnk < 3;
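
The same top-N can be written with the DataFrame API. A minimal sketch, assuming the sample rows live in a hypothetical data/input/people.csv:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().appName("topn").master("local[*]").getOrCreate()

val people = spark.read.option("inferSchema", "true")
  .csv("data/input/people.csv")
  .toDF("id", "sex", "age", "name")

// rank within each sex by descending age, then keep the top two
val w = Window.partitionBy("sex").orderBy(col("age").desc)
people.withColumn("rnk", row_number().over(w))
  .filter(col("rnk") < 3)
  .show()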

3. TOPN sql 2

create table stu_skills(
id int,
name string,
age int,
skills string
) 
row format delimited fields terminated by ',';
1,pk001,35,Spark-CDH-Hive-Hadoop
2,pk002,37,Flink-CDH-Hive-Hadoop
3,pk003,32,CDH-Hive-Hadoop
4,pk004,21,Spark-Hive-Hadoop
5,pk005,25,Hadoop-HBase-Hive
6,pk006,28,Spark-Hive-Hadoop
Group by skill (after splitting the skills column) and find the two oldest students per skill:
select
b.*
from(
select a.*, row_number() over (partition by a.skill order by a.age desc) r
from(
select
id, name,age,
explode(split(skills,"-")) skill
from stu_skills) a
) b where b.r <=2;
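
The explode-then-rank pattern translates directly to the DataFrame API. A sketch, assuming a hypothetical data/input/stu_skills.csv with the sample rows and the spark session from the previous sketch:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val stu = spark.read.csv("data/input/stu_skills.csv")
  .toDF("id", "name", "age", "skills")

// one row per (student, skill) pair
val exploded = stu.withColumn("skill", explode(split(col("skills"), "-")))

val w = Window.partitionBy("skill").orderBy(col("age").cast("int").desc)
exploded.withColumn("r", row_number().over(w))
  .filter(col("r") <= 2)
  .show(false)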

4. Rows to columns (pivot)

1,spark
1,flink
1,cdh
1,hive
2,spark
2,cdh
2,kafka
2,flume
3,spark
3,flink
3,cdh
3,hive
create table stu_course(
id int,
course string
) row format delimited fields terminated by ',';

Using CASE WHEN:

select 
t.id,
max(t.spark) as spark ,
max(t.flink) as flink,
max(t.cdh) as cdh,
max(t.hive) as hive,
max(t.kafka) as kafka,
max(t.flume) as flume
from 
(
select 
id,
case when course='spark' then 1 else 0 end as spark,
case when course='flink' then 1 else 0 end as flink,
case when course='cdh' then 1 else 0 end as cdh,
case when course='hive' then 1 else 0 end  as hive,
case when course='kafka' then 1 else 0 end as kafka,
case when course='flume' then 1 else 0 end as flume
from stu_course
) t group by t.id;

Result:

id      spark   flink   cdh     hive    kafka   flume
1       1       1       1       1       0       0
3       1       1       1       1       0       0
2       1       0       1       0       1       1
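
Spark's DataFrame API has a built-in pivot that produces the same table without enumerating the CASE WHEN branches by hand. A sketch, assuming a hypothetical data/input/stu_course.csv with the sample rows and the spark session from the earlier sketch:

import org.apache.spark.sql.functions._

val stuCourse = spark.read.csv("data/input/stu_course.csv")
  .toDF("id", "course")

stuCourse.groupBy("id")
  .pivot("course")        // one output column per distinct course
  .agg(count(lit(1)))     // 1 where the (id, course) pair exists
  .na.fill(0)             // absent pairs become 0 instead of null
  .show()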

Advanced version: derive the columns from the data

select collect_set(course) as course from stu_course;

course
["cdh","hive","flink","spark","flume","kafka"]

select id, collect_set(course) as course from stu_course group by id;

id      course
1       ["cdh","hive","flink","spark"]
3       ["cdh","hive","flink","spark"]
2       ["cdh","spark","flume","kafka"]

create table tmp1 as 
select 
b.id, b.course, a.courses
from 
(select collect_set(course) as courses from stu_course) a
join  
(select id, collect_set(course) as course from stu_course group by id) b;

id      course  courses
1       ["cdh","hive","flink","spark"]  ["cdh","hive","flink","spark","flume","kafka"]
3       ["cdh","hive","flink","spark"]  ["cdh","hive","flink","spark","flume","kafka"]
2       ["cdh","spark","flume","kafka"] ["cdh","hive","flink","spark","flume","kafka"]

Matching. Note that collect_set does not guarantee element order, so the aliases below follow the sample courses array shown above:

select 
id,
case when array_contains(course, courses[0]) then 1 else 0 end as cdh,
case when array_contains(course, courses[1]) then 1 else 0 end as hive,
case when array_contains(course, courses[2]) then 1 else 0 end as flink,
case when array_contains(course, courses[3]) then 1 else 0 end as spark,
case when array_contains(course, courses[4]) then 1 else 0 end as flume,
case when array_contains(course, courses[5]) then 1 else 0 end as kafka
from tmp1;
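
In Spark the same derive-the-columns idea works without a temporary table: collect the distinct courses once, then fold them into withColumn calls. A sketch, reusing the stuCourse DataFrame from the pivot sketch above:

import org.apache.spark.sql.functions._

// the dynamic column list (small enough to collect to the driver)
val courses = stuCourse.select("course").distinct()
  .collect().map(_.getString(0)).sorted

val perId = stuCourse.groupBy("id").agg(collect_set("course").as("course"))

// add one 0/1 flag column per course
val withFlags = courses.foldLeft(perId) { (df, c) =>
  df.withColumn(c, when(array_contains(col("course"), c), 1).otherwise(0))
}
withFlags.drop("course").show()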

5. Numbers that appear three times in a row

1,1
2,1
3,1
4,2
5,1
6,2
7,2
8,3
9,3
10,3
11,3
12,2
13,4
14,5

create table series(
id int,
number int
) row format delimited fields terminated by ',';

The most naive approach: self-joins (this relies on the ids being consecutive):

select
distinct(a.number)
from series a 
join series b
on a.id = b.id-1
join series c
on b.id = c.id-1
where a.number = b.number and b.number = c.number; 

Window-function approach:

select
distinct(a.number)
from 
(select
id, number,
lag(id,2) over(partition by number order by id) prev
from series) a  where a.id = a.prev+2;
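
The lag trick carries over to the DataFrame API unchanged. A sketch, assuming a hypothetical data/input/series.csv with the sample rows and the spark session from earlier:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val series = spark.read.option("inferSchema", "true")
  .csv("data/input/series.csv")
  .toDF("id", "number")

// the id two rows earlier within the same number, ordered by id
val w = Window.partitionBy("number").orderBy("id")
series.withColumn("prev", lag(col("id"), 2).over(w))
  .filter(col("id") === col("prev") + 2)   // three consecutive ids, same number
  .select("number").distinct()
  .show()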

Data without an id column

1
1
1
2
1
2
2
3
3
3
3
2
4
5

create table consecutive(
number int
) row format delimited fields terminated by ',';

First synthesize a row number (order by null keeps the input order in practice, though Hive does not strictly guarantee it):

select number, row_number() over(order by null) r from consecutive;

select
distinct(a.number)
from 
(select
b.r, number,
lag(b.r,2) over(partition by b.number order by b.r) prev
from 
(
select number ,row_number () over(order by null) r from consecutive
) b
) a  where a.r= a.prev+2;
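
In Spark, a cleaner substitute for row_number() over (order by null) is zipWithIndex on an RDD, which assigns a stable position in file order. A sketch, assuming a hypothetical data/input/consecutive.txt with one number per line and a SparkContext sc in scope:

val nums = sc.textFile("data/input/consecutive.txt").map(_.trim.toInt)

nums.zipWithIndex()                  // (number, position in file order)
  .groupByKey()                      // all positions where each number occurs
  .filter { case (_, positions) =>
    // three in a row <=> some three of its positions are consecutive
    positions.toList.sorted.sliding(3).exists {
      case List(a, b, c) => b == a + 1 && c == b + 1
      case _             => false
    }
  }
  .keys
  .foreach(println)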

6. Students whose total score improved every year

create table score(
year int,
course string,
student string,
score int
) row format delimited fields terminated by ',';

2020,chinese,A,83
2020,math,A,58
2020,english,A,29
2020,chinese,B,83
2020,math,B,77
2020,english,B,67
2021,chinese,A,50
2021,math,A,93
2021,english,A,70
2021,chinese,B,86
2021,math,B,43
2021,english,B,39
2022,chinese,A,90
2022,math,A,49
2022,english,A,70
2022,chinese,B,86
2022,math,B,83
2022,english,B,97
Question 1: the top-scoring student of each course in each year (group by year, course).

select
a.year, a.course, b.student
from
(select year, course, max(score) max_score
from score group by year, course) a
join score b
on a.year = b.year and a.course = b.course and a.max_score = b.score;

Window-function implementation:

select
a.year,a.course,a.student
from
(
select
year,course,student,score,
max(score) over(partition by year,course) max_score
from score
) a where a.score = a.max_score;

A second approach (the desc matters: without it first_value returns the lowest scorer):

select
distinct year, course,
first_value(student) over(partition by year, course order by score desc) student
from score order by year;
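
Question 1 in the DataFrame API follows the same max-over-window shape. A sketch, assuming a hypothetical data/input/score.csv with the sample rows and the spark session from earlier:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val score = spark.read.option("inferSchema", "true")
  .csv("data/input/score.csv")
  .toDF("year", "course", "student", "score")

// the per-(year, course) maximum, attached to every row of the group
val w = Window.partitionBy("year", "course")
score.withColumn("max_score", max(col("score")).over(w))
  .filter(col("score") === col("max_score"))
  .select("year", "course", "student")
  .show()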

Question 2: students whose total score rose every year.

select
b.student from
(
select
a.year, a.student, a.sum_score,
if((a.sum_score - lag(a.sum_score, 1, 0) over (partition by a.student order by a.year)) > 0, 1, 0) f2
from
(select year, student, sum(score) sum_score from score group by year, student) a
) b group by b.student having avg(f2) = 1;
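
The same logic in the DataFrame API, reusing the score DataFrame from the sketch above: compare each year's total to the previous year's with lag, then keep students whose every comparison is an increase:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val yearly = score.groupBy("year", "student").agg(sum("score").as("sum_score"))

// default 0 makes the first year count as an increase, matching the SQL
val w = Window.partitionBy("student").orderBy("year")
yearly
  .withColumn("prev", lag(col("sum_score"), 1, 0).over(w))
  .withColumn("up", when(col("sum_score") > col("prev"), 1).otherwise(0))
  .groupBy("student")
  .agg(min("up").as("always_up"))   // 1 only if every year improved
  .filter(col("always_up") === 1)
  .select("student")
  .show()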

7. Maximum consecutive login days per user

pk,20210801
pk,20210802
pk,20210803
pk,20210804
pk,20210806
pk,20210807
pk,20210808
pk,20210811
pk,20210812
ruoze,20210730
ruoze,20210731
ruoze,20210801
ruoze,20210804
ruoze,20210806

create table login(
name string,
day string
) row format delimited fields terminated by ',';
Find each user's longest consecutive login streak within the year.

The trick is converting each date to its day-of-year, so consecutive days differ by exactly 1 and subtracting row_number yields a constant per streak:

select 
b.name, max(b.consecutive_day)
from
(select
a.name, count(1) consecutive_day
from
(
select name,day,
dayofyear(from_unixtime(unix_timestamp(day,'yyyyMMdd'),'yyyy-MM-dd')) - row_number() over(partition by name order by day) diff
from login
) a group by a.name,a.diff) b group by b.name;
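
The same dayofyear - row_number construction in the DataFrame API. A sketch, assuming a hypothetical data/input/login.csv with the sample rows and the spark session from earlier:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val login = spark.read.csv("data/input/login.csv").toDF("name", "day")

val w = Window.partitionBy("name").orderBy("day")
login
  .withColumn("d", to_date(col("day"), "yyyyMMdd"))
  // constant per streak: day-of-year minus the row number within the user
  .withColumn("diff", dayofyear(col("d")) - row_number().over(w))
  .groupBy("name", "diff").agg(count(lit(1)).as("streak"))
  .groupBy("name").agg(max("streak").as("max_streak"))
  .show()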