Spark题目练习

建表:

// Student table: id, name, birth date (kept as a yyyy-MM-dd string) and sex.
// Spark's 32-bit integer type is IntegerType (there is no IntType).
private val schema = StructType(Array(
  StructField("student_id", org.apache.spark.sql.types.IntegerType, true),
  StructField("student_name", StringType, true),
  StructField("birth", StringType, true),
  StructField("sex", StringType, true)
))
// Raw student records as (student_id, student_name, birth, sex) tuples.
val rdd = sc.makeRDD(Array(
  (1, "赵雷", "1990-01-01", "男"),
  (2, "钱电", "1990-12-21", "男"),
  (3, "孙风", "1990-05-20", "男"),
  (4, "李云", "1990-08-06", "男"),
  (5, "周梅", "1991-12-01", "女"),
  (6, "吴兰", "1992-03-01", "女"),
  (7, "郑竹", "1989-07-01", "女"),
  (8, "王菊", "1990-01-20", "女")
))
// createDataFrame with an explicit StructType expects an RDD[Row], so each tuple
// is converted to a Row before the schema is applied.
private val studentDF: DataFrame =
  spark.createDataFrame(rdd.map(t => org.apache.spark.sql.Row.fromTuple(t)), schema)
// Course table: course id, course name and the id of the teacher giving the course.
// Spark's 32-bit integer type is IntegerType (there is no IntType).
private val schema2 = StructType(Array(
  StructField("course_id", org.apache.spark.sql.types.IntegerType, true),
  StructField("course_name", StringType, true),
  StructField("teacher_id", org.apache.spark.sql.types.IntegerType, true)
))
// Raw course records as (course_id, course_name, teacher_id) tuples.
val rdd2 = sc.makeRDD(Array(
  (1, "语文", 2),
  (2, "数学", 1),
  (3, "英语", 3)
))
// createDataFrame with an explicit StructType expects an RDD[Row], so each tuple
// is converted to a Row before the schema is applied.
private val courseDF: DataFrame =
  spark.createDataFrame(rdd2.map(t => org.apache.spark.sql.Row.fromTuple(t)), schema2)
// Teacher table: teacher id and teacher name.
// Spark's 32-bit integer type is IntegerType (there is no IntType).
private val schema3 = StructType(Array(
  StructField("teacher_id", org.apache.spark.sql.types.IntegerType, true),
  StructField("teacher_name", StringType, true)
))
// Raw teacher records as (teacher_id, teacher_name) tuples.
val rdd3 = sc.makeRDD(Array(
  (1, "张三"),
  (2, "李四"),
  (3, "王五")
))
// createDataFrame with an explicit StructType expects an RDD[Row], so each tuple
// is converted to a Row before the schema is applied.
private val teacherDF: DataFrame =
  spark.createDataFrame(rdd3.map(t => org.apache.spark.sql.Row.fromTuple(t)), schema3)
// Score table: one row per (student, course) with the score obtained.
// Named schema4 so it no longer redeclares the teacher schema (schema3).
private val schema4 = StructType(Array(
  StructField("student_id", org.apache.spark.sql.types.IntegerType, true),
  StructField("course_id", org.apache.spark.sql.types.IntegerType, true),
  StructField("score", org.apache.spark.sql.types.IntegerType, true)
))
// Raw score records as (student_id, course_id, score) tuples.
val rdd4 = sc.makeRDD(Array(
  (1, 1, 80),
  (1, 2, 90),
  (1, 3, 99),
  (2, 1, 70),
  (2, 2, 60),
  (2, 3, 80),
  (3, 1, 80),
  (3, 2, 80),
  (3, 3, 80),
  (4, 1, 50),
  (4, 2, 30),
  (4, 3, 20),
  (5, 1, 76),
  (5, 2, 87),
  (6, 1, 31),
  (6, 3, 34),
  (7, 2, 89),
  (7, 3, 98)
))
// Build the DataFrame from the score data (rdd4) and the score schema; the tuples
// are converted to Rows because createDataFrame with a StructType expects an RDD[Row].
private val scoreDF: DataFrame =
  spark.createDataFrame(rdd4.map(t => org.apache.spark.sql.Row.fromTuple(t)), schema4)

import org.apache.spark.sql.functions._

// Query 1: students whose score in course 01 is higher than their score in course 02,
// shown with the student details and both scores.
// The score table is joined to itself per student so one row carries both courses.
scoreDF.as("s1")
  .join(scoreDF.as("s2"), "student_id")
  .where("s1.course_id=1 and s2.course_id=2 and s1.score>s2.score")
  .join(studentDF, "student_id")
  .show
// Query 2: students whose score in course 01 is lower than their score in course 02,
// shown with the student details and both scores.
scoreDF.as("s1")
  .join(scoreDF.as("s2"), "student_id")
  .where("s1.course_id=1 and s2.course_id=2 and s1.score<s2.score")
  .join(studentDF, "student_id")
  .show
// Query 3: student id, name and average score for students averaging 60 or more.
scoreDF.as("s1")
  .groupBy("student_id")
  .avg("score")
  .join(studentDF.as("s2"), "student_id")
  .where($"avg(score)" >= 60)
  .show
// Query 4: student id, name and average score for students averaging below 60,
// including students that have no scores at all (their average is null after the left join).
studentDF.as("s2")
  .join(
    scoreDF.as("s1").groupBy("student_id").avg("score").as("s3"),
    Seq("student_id"),
    "left_outer")
  .as("s")
  .withColumnRenamed("avg(score)", "A")
  .filter((col("A") < 60) || col("A").isNull)
  .show
// Query 5: for every student, the student id, name, number of courses taken
// and the total score over all courses.
// Number of score rows (courses taken) per student.
val q5CourseCounts = scoreDF.groupBy("student_id").count
// Sum of scores per student.
val q5ScoreTotals = scoreDF.groupBy("student_id").sum("score")
// Left-join both aggregates onto the student table so students without scores are kept.
studentDF
  .join(q5CourseCounts, Seq("student_id"), "left_outer")
  .join(q5ScoreTotals, Seq("student_id"), "left_outer")
  .show
// Query 6: number of teachers whose name starts with "李".
teacherDF
  .filter("teacher_name like '李%'")
  .select("teacher_id")
  .count
// Query 7: details of students who took a course taught by teacher "张三".
scoreDF
  .join(courseDF, "course_id")
  .join(teacherDF, "teacher_id")
  .where("teacher_name = '张三'")
  .join(studentDF, "student_id")
  .show
// Query 8: details of students who never took a course taught by teacher "张三".
// First collect the ids of students that have a score in any of 张三's courses, then
// anti-join: a student is kept only if no such row exists. Filtering joined rows on
// teacher_name != '张三' would instead keep anyone with at least one other teacher.
studentDF
  .join(
    scoreDF
      .join(courseDF, "course_id")
      .join(teacherDF.where("teacher_name = '张三'"), "teacher_id")
      .select("student_id"),
    Seq("student_id"),
    "left_anti")
  .show
// Query 9: details of students who took both course 01 and course 02.
// Keeps students having exactly two score rows among courses 1 and 2.
scoreDF
  .filter("course_id in(1,2)")
  .groupBy("student_id")
  .count
  .filter("count=2")
  .join(studentDF, "student_id")
  .show
// Query 10: details of students who took course 01 but not course 02.
// Students with no course-2 row survive the null check, then are matched to course-1 rows.
studentDF
  .join(scoreDF.filter("course_id in (2)"), Seq("student_id"), "left_outer")
  .as("s1")
  .filter("s1.course_id is null")
  .join(scoreDF.filter("course_id in (1)"), "student_id")
  .show
// Query 11: details of students who have not taken every course.
// The per-student row count is compared against the number of rows in the course table.
val q11CourseTotal = courseDF.select("course_id").count
studentDF
  .join(scoreDF, Seq("student_id"), "left_outer")
  .groupBy("student_id")
  .count
  .filter(s"count != ${q11CourseTotal}")
  .join(studentDF, "student_id")
  .show
// Query 12: details of students sharing at least one course with student 01
// (student 01 itself is excluded).
studentDF.as("a")
  .join(scoreDF.as("c"), "student_id")
  .as("d")
  .join(scoreDF.filter("student_id=1").as("b"), "course_id")
  .select("d.student_id")
  .distinct
  .filter("student_id!=1")
  .join(studentDF, "student_id")
  .show
// Query 13: details of the other students whose set of courses is exactly the same
// as student 01's.
// Courses taken by student 01, and how many there are.
val q13TargetCourses = scoreDF.where("student_id=1").select("course_id")
val q13TargetCount = q13TargetCourses.count
// Per student: total number of courses taken.
val q13Totals = scoreDF.groupBy("student_id").agg(count("course_id").as("total"))
// Per student: number of courses shared with student 01.
val q13Shared = scoreDF
  .join(q13TargetCourses, "course_id")
  .groupBy("student_id")
  .agg(count("course_id").as("shared"))
// The sets are identical only when the student shares all of student 01's courses
// (shared = target) AND has no additional ones (total = target).
// NOTE(review): assumes at most one score row per (student, course) pair.
q13Totals
  .join(q13Shared, "student_id")
  .where(s"student_id != 1 and total = $q13TargetCount and shared = $q13TargetCount")
  .join(studentDF, "student_id")
  .show
// Query 14: students who have not taken any course taught by teacher "张三".
// Students are left-joined to 张三's course/score rows; unmatched students have a null teacher_id.
studentDF
  .join(
    teacherDF
      .filter("teacher_name='张三'")
      .join(courseDF, "teacher_id")
      .join(scoreDF, Seq("course_id"), "left_outer"),
    Seq("student_id"),
    "left_outer")
  .as("s1")
  .filter("s1.teacher_id is null")
  .show
// Query 15: student id, name and average score of students failing (score < 60)
// two or more courses. The average is taken over all of the student's scores.
scoreDF
  .filter("score<60")
  .groupBy("student_id")
  .count
  .filter("count>=2")
  .as("s1")
  .join(scoreDF.as("s2"), "student_id")
  .groupBy("student_id")
  .avg("score")
  .join(studentDF, "student_id")
  .show
// Query 16: students scoring below 60 in course 01, ordered by score descending.
scoreDF
  .filter("course_id=1 and score<60")
  .join(studentDF, "student_id")
  .sort($"score".desc)
  .show
// Query 17: every score of every student together with the student's average score,
// ordered by that average from high to low.
scoreDF
  .join(scoreDF.groupBy("student_id").avg("score"), Seq("student_id"), "left_outer")
  .join(studentDF, "student_id")
  .sort($"avg(score)".desc)
  .show
// Query 18: per course, show course id, course name, highest / lowest / average score
// and the pass, medium, good and excellent rates.
// Score bands used here: pass >= 60, medium [70, 80), good [80, 90), excellent >= 90.
// The per-band head counts are computed with column expressions (sum over a 0/1 flag),
// which keeps course_id as an integer and avoids reading the score through an untyped
// Row.getAs call.
val jige = scoreDF.groupBy("course_id")
  .agg(sum(when($"score" >= 60, 1).otherwise(0)).as("jige"))
val zhongdeng = scoreDF.groupBy("course_id")
  .agg(sum(when($"score" >= 70 && $"score" < 80, 1).otherwise(0)).as("zhongdeng"))
val youliang = scoreDF.groupBy("course_id")
  .agg(sum(when($"score" >= 80 && $"score" < 90, 1).otherwise(0)).as("youliang"))
val youxiu = scoreDF.groupBy("course_id")
  .agg(sum(when($"score" >= 90, 1).otherwise(0)).as("youxiu"))
// Max, min, average and number of scores per course.
val s1 = scoreDF.groupBy("course_id").agg("score"->"max","score"->"min","score"->"avg","score"->"count")
// Attach the course name and the band counts, turn counts into rates, drop the raw counts.
s1.join(courseDF.select("course_id", "course_name"), "course_id")
  .join(jige, "course_id")
  .join(zhongdeng, "course_id")
  .join(youliang, "course_id")
  .join(youxiu, "course_id")
  .withColumn("jgl", $"jige" / $"count(score)")
  .withColumn("zdl", $"zhongdeng" / $"count(score)")
  .withColumn("yll", $"youliang" / $"count(score)")
  .withColumn("yxl", $"youxiu" / $"count(score)")
  .drop("jige", "zhongdeng", "youliang", "youxiu")
  .show
// Query 19: rank the scores within each course (highest score first).
scoreDF
  .selectExpr("*", "row_number() over(partition by course_id order by score desc) rank")
  .show
// Query 20: total score per student and its rank.
// The windowed sum is repeated on every row of a student, so duplicates on
// (student_id, sum_score) are dropped before ranking.
scoreDF
  .selectExpr("*", "sum(score) over(partition by student_id) as sum_score")
  .dropDuplicates("student_id", "sum_score")
  .selectExpr("*", "row_number() over(order by sum_score desc) rank")
  .show
// Query 21: average score per teacher and course, from high to low.
scoreDF
  .join(courseDF, "course_id")
  .join(teacherDF, "teacher_id")
  .groupBy("teacher_id", "course_id")
  .avg("score")
  .sort($"avg(score)".desc)
  .show
// Query 22: students ranked 2nd to 3rd in each course, with their score in that course.
scoreDF
  .selectExpr("*", "row_number() over(partition by course_id order by score desc) rank")
  .filter("rank between 2 and 3")
  .join(studentDF, "student_id")
  .show
// Query 23: per course, the number of students in each score band and the share
// of the course's scores that band represents, with course id and course name.
// Band codes in column "fenduan": 1 = [0, 60), 2 = [60, 70), 3 = [70, 85), 4 = [85, 100].
// The band is derived with a column expression so that score and course_id keep their
// integer types and no untyped Row.getAs call is needed.
val fenduan = scoreDF.select(
  $"course_id",
  when($"score" < 60, 1)
    .when($"score" < 70, 2)
    .when($"score" < 85, 3)
    .otherwise(4)
    .as("fenduan"))
// f1 holds the total number of scores per course, f2 the number per (course, band);
// their ratio is the band's share within the course.
fenduan.groupBy("course_id").count.as("f1")
  .join(fenduan.groupBy("course_id", "fenduan").count.as("f2"), "course_id")
  .withColumn("rate", $"f2.count" / $"f1.count")
  .drop($"f1.count")
  .join(courseDF, "course_id")
  .show
// Query 24: average score per student and its rank (highest average first).
// The aggregated column is literally named "avg(score)"; inside a SQL expression it must
// be quoted with backticks. Single quotes would make it a string literal, i.e. a constant
// sort key, and the resulting rank would be arbitrary.
scoreDF
  .groupBy("student_id")
  .avg("score")
  .selectExpr("*", "row_number() over(order by `avg(score)` desc) as rank")
  .show
// Query 25: the top three score rows of each course.
scoreDF
  .selectExpr("*", "row_number() over(partition by course_id order by score desc) rank")
  .filter("rank <=3")
  .show
// Query 26: number of students enrolled in each course.
scoreDF
  .groupBy("course_id")
  .count
  .show
// Query 27: student id and name of students who took exactly two courses.
scoreDF
  .groupBy("student_id")
  .count
  .filter("count=2")
  .join(studentDF, "student_id")
  .show
// Query 28: number of male and female students.
studentDF
  .groupBy("sex")
  .count
  .show
// Query 29: students whose name contains the character "风".
studentDF
  .filter("student_name like '%风%'")
  .show
// Query 30: names shared by more than one student, with the number of students per name.
studentDF
  .groupBy("student_name")
  .count
  .filter("count>1")
  .show
// Query 31: students born in 1990.
studentDF
  .filter("year(birth) = 1990")
  .show
// Query 32: average score per course, ordered by average descending; ties are broken
// by course id ascending.
// Both sort keys go into a single orderBy: a second orderBy call would replace the
// first ordering instead of refining it.
scoreDF
  .groupBy("course_id")
  .avg("score")
  .orderBy($"avg(score)".desc, $"course_id".asc)
  .show
// Query 33: student id, name and average score of students averaging 85 or more.
scoreDF
  .groupBy("student_id")
  .avg("score")
  .filter("avg(score)>=85")
  .join(studentDF, "student_id")
  .show
// Query 34: name and score of students scoring below 60 in the course named "数学".
// The student table is joined in so the requested student name is available, and the
// output is reduced to the two requested columns.
scoreDF
  .where("score<60")
  .join(courseDF, "course_id")
  .where("course_name='数学'")
  .join(studentDF, "student_id")
  .select("student_name", "score")
  .show
// Query 35: courses and scores of all students (students without scores are kept).
studentDF
  .join(scoreDF, Seq("student_id"), "left_outer")
  .show
// Query 36: student name, course name and score for every score above 70.
scoreDF
  .filter("score>70")
  .join(studentDF, "student_id")
  .join(courseDF, "course_id")
  .show
// Query 37: students with a failing score (below 60) in some course.
scoreDF
  .filter("score<60")
  .join(studentDF, "student_id")
  .show
// Query 38: student id and name of students scoring 80 or more in course 01.
scoreDF
  .filter("course_id=1 and score>=80")
  .join(studentDF, "student_id")
  .show
// Query 39: number of students per course.
scoreDF
  .groupBy("course_id")
  .count
  .show
// Query 40: among students taking courses taught by "张三", the row(s) with the highest score.
// A window over the whole filtered set supplies the maximum to compare each score against.
scoreDF
  .join(courseDF, "course_id")
  .join(teacherDF, "teacher_id")
  .filter("teacher_name='张三'")
  .selectExpr("*", "max(score) over() max_score")
  .filter("max_score=score")
  .show
// Query 41: student id, course id and score for scores that are equal across different courses.
scoreDF.as("s1")
  .crossJoin(scoreDF.as("s2"))
  .filter("s1.score=s2.score and s1.course_id!=s2.course_id")
  .show
// Query 42: the three best scores of each course.
scoreDF
  .selectExpr("*", "row_number() over(partition by course_id order by score desc) rank")
  .filter("rank<=3")
  .show
// Query 43: number of enrolled students per course, only for courses with more than
// 5 students; output course id and head count, ordered by head count descending and,
// for equal counts, by course id ascending.
// One row per course is produced by the aggregation itself, "more than 5" is a strict
// comparison, and both sort keys are given to a single orderBy so the second key only
// breaks ties instead of replacing the first ordering.
scoreDF
  .groupBy("course_id")
  .count
  .withColumnRenamed("count", "cnt")
  .where("cnt > 5")
  .orderBy($"cnt".desc, $"course_id".asc)
  .show
// Query 44: ids of students who took at least two courses.
scoreDF
  .groupBy("student_id")
  .count
  .filter("count>=2")
  .show
// Query 45: details of students who took every course.
// The per-student row count is compared against the number of rows in the course table.
val q45CourseTotal = courseDF.select("course_id").count
studentDF
  .join(scoreDF, Seq("student_id"), "left_outer")
  .groupBy("student_id")
  .count
  .filter(s"count = ${q45CourseTotal}")
  .join(studentDF, "student_id")
  .show
// Query 46: age of every student in completed years.
// The number of whole months between the birth date and today, divided by 12 and
// rounded down, only counts a year once the birthday has actually passed; a plain
// difference of the calendar years would be one too high before the birthday.
studentDF
  .selectExpr(
    "*",
    "cast(floor(months_between(current_date(), to_date(birth)) / 12) as Int) age")
  .show
// Query 47: students whose birthday falls in the current week.
// The birthday is rebuilt in the current year (current year + birth month + birth day) and
// must lie between today and the day before the next Monday, i.e. the last day of this week.
// Because the window starts at today, birthdays earlier in this week are not matched.
// Open question from the original author: does this break when today is already Sunday?
// NOTE(review): presumably next_day returns the first Monday strictly after today, in which
// case the window collapses to today on a Sunday - confirm against the Spark documentation.
studentDF.where("unix_timestamp(cast(concat_ws('-',date_format(current_date(),'yyyy'),date_format(birth,'MM'),date_format(birth,'dd')) as date),'yyyy-MM-dd') between unix_timestamp(current_date()) and unix_timestamp(date_sub(next_day(current_date(),'MON'),1),'yyyy-MM-dd')").show
// Query 48: students whose birthday falls in the next week.
// Same rebuilt birthday as above; the window runs from next Monday through next Monday + 6 days.
studentDF.where("unix_timestamp(cast(concat_ws('-',date_format(current_date(),'yyyy'),date_format(birth,'MM'),date_format(birth,'dd')) as date),'yyyy-MM-dd')between unix_timestamp(next_day(current_date(),'MON'),'yyyy-MM-dd') and unix_timestamp(date_add(next_day(current_date(),'MON'),6),'yyyy-MM-dd')").show
// Query 49: students whose birthday is in the current month.
studentDF
  .filter("month(birth) = month(current_date())")
  .show
// Query 50: students whose birthday is in December.
studentDF
  .filter("month(birth) = 12")
  .show
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值