Data used in the code
Sample of the score data (fields: student ID, subject ID, score):
1500100001,1000001,98
1500100001,1000002,5
1500100001,1000003,137
1500100001,1000004,29
1500100001,1000005,85
1500100001,1000006,52
1500100002,1000001,139
1500100002,1000002,102
1500100002,1000003,44
1500100002,1000004,18
1500100002,1000005,46
1500100002,1000006,91
Sample of the student data (fields: student ID, name, age, gender, class):
1500100001,施笑槐,22,女,文科六班
1500100002,吕金鹏,24,男,文科六班
1500100003,单乐蕊,22,女,理科六班
1500100004,葛德曜,24,男,理科三班
1500100005,宣谷芹,22,女,理科五班
1500100006,边昂雄,21,男,理科二班
1500100007,尚孤风,23,女,文科六班
1500100008,符半双,22,女,理科六班
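Every task below splits these CSV lines by comma and picks fields by index. As a minimal sketch of the field layout (the variable names here are only illustrative, they do not appear in the code below):

// score.txt:    student ID, subject ID, score
val Array(sid, subjectId, score) = "1500100001,1000001,98".split(",")
// students.txt: student ID, name, age, gender, class
val stu = "1500100001,施笑槐,22,女,文科六班".split(",")
val (id, name, clazz) = (stu(0), stu(1), stu(4))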
1. List each subject's score for the top 10 students in the grade, ranked by total score [student ID, student name, class, subject, score]
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object test1 {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
val sc = new SparkContext(conf)
val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")
/**
 * 1. List each subject's score for the top 10 students in the grade [student ID, name, class, subject, score]
 */
// (student ID, score) pairs
val scoreInfo: RDD[(String, Int)] = scoreRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val score: Int = lines(2).toInt
(id, score)
})
// total score per student
val score_reduceRDD: RDD[(String, Int)] = scoreInfo.reduceByKey((x, y) => x + y)
// IDs of the 10 students with the highest total score
val ids: Array[String] = score_reduceRDD.sortBy(kv => kv._2, ascending = false).take(10)
.map(kv => kv._1)
// broadcast the small ID array instead of shipping it inside every task closure
val broadcast: Broadcast[Array[String]] = sc.broadcast(ids)
// keep only students whose ID is in the top-10 list
val filteRDD: RDD[String] = stuRDD.filter(line => {
val id: String = line.split(",")(0)
val value: Array[String] = broadcast.value
value.contains(id)
})
val kv_RDD = filteRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val name: String = lines(1)
val clazz: String = lines(4)
(id, (name, clazz))
})
// (student ID, (score, subject ID)) for every score record
val scoreInfo2 = scoreRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val subject: String = lines(1)
val score = lines(2).toInt
(id, (score, subject))
})
// join the top-10 students with all of their score records to get per-subject rows
val resultRDD = kv_RDD.join(scoreInfo2)
resultRDD.map {
case (id: String, ((name: String, clazz: String), (score: Int, subject: String))) => {
(id,name,clazz,subject,score)
}
}.foreach(println)
}
}
2. Find students whose total score is above the grade average [student ID, name, class, total score]
Note: one issue came up while writing this code: an RDD has no contains method, so the RDD of qualifying IDs has to be collected into a driver-side array first, and the array's contains is then used inside the filter.
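A minimal sketch of that pattern (idRDD and dataRDD are placeholder names, not from the code below; sc is the SparkContext); the broadcast variant used in test1 is shown for comparison:

// idRDD: RDD[String] of keys to keep; dataRDD: RDD[String] of "id,..." CSV lines
val keepIds: Array[String] = idRDD.collect()                          // bring the (small) ID set to the driver
val kept = dataRDD.filter(line => keepIds.contains(line.split(",")(0)))

// for comparison: broadcasting the array (as in test1) sends it to each executor once
val bc = sc.broadcast(keepIds)
val kept2 = dataRDD.filter(line => bc.value.contains(line.split(",")(0)))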
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object test2 {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
val sc = new SparkContext(conf)
val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")
/**
 * 2. Find students whose total score is above the grade average [student ID, name, class, total score]
 */
// total score per student
val sumscoreInfo: RDD[(String, Int)] = scoreRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val score = lines(2).toInt
(id, score)
}).reduceByKey(_ + _)
// grade average of the total score: sum of all scores divided by the number of students
val score_avg: Double = scoreRDD.map(line => line.split(",")(2).toInt).sum() / stuRDD.count()
// students whose total score is above that average
val filter_IDs: RDD[(String, Int)] = sumscoreInfo.filter(line => {
line._2 > score_avg
})
val ids = filter_IDs.map(line => line._1).collect()
// an RDD has no contains method, so collect the IDs into a driver-side array first
val filterRDD = stuRDD.filter(line => {
val id: String = line.split(",")(0)
ids.contains(id)
})
val resultRDD = filterRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val name: String = lines(1)
val clazz: String = lines(4)
(id, (name, clazz))
})
val joinRDD: RDD[(String, ((String, String), Int))] = resultRDD.join(sumscoreInfo)
joinRDD.map {
case (id: String, ((name: String, clazz: String), score: Int)) => (id, name, clazz, score)
}.foreach(println)
}
}
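The grade average above is the sum of every score divided by the number of students, i.e. the average total score per student. As a hedged alternative sketch, the same figure can be derived from the per-student totals already held in sumscoreInfo, assuming every student has score records (otherwise the two denominators differ):

// average of the per-student totals computed in sumscoreInfo
val avgTotal: Double = sumscoreInfo.map(_._2).sum() / sumscoreInfo.count()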
3. Find students who passed every subject [student ID, name, class, subject, score]
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object test3 {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
val sc = new SparkContext(conf)
val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")
/**
 * 3. Find students who passed every subject [student ID, name, class, subject, score]
 */
// (student ID, score) pairs, keeping only passing scores
val filterRDD = scoreRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val score: Int = lines(2).toInt
(id, score)
}).filter(kv => kv._2 >= 60) // passing means a score of at least 60
// group the passing records by student; a student who passed all 6 subjects keeps 6 records
val groupRDD: RDD[(String, Iterable[(String, Int)])] = filterRDD.groupBy(kv => kv._1)
val ids: Array[String] = groupRDD.filter(line => line._2.size == 6).map(line => line._1).collect()
val resultRDD = stuRDD.filter(line => ids.contains(line.split(",")(0)))
.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val name: String = lines(1)
val clazz: String = lines(4)
(id, (name, clazz))
})
val resultRDD2 = scoreRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val subject: String = lines(1)
val score: String = lines(2)
(id, (subject, score))
})
// every remaining student has score records, so the left outer join never yields None
val joinRDD: RDD[(String, ((String, String), Option[(String, String)]))] = resultRDD.leftOuterJoin(resultRDD2)
joinRDD.map {
case (id: String, ((name: String, clazz: String), Some((subject: String, score: String)))) => (id, name, clazz, subject, score)
}.foreach(println)
}
}
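Because resultRDD was already filtered to students whose IDs appear in the score data, a plain inner join (as in test1) would give the same output without the Option wrapper and the Some(...) pattern; a hedged sketch:

resultRDD.join(resultRDD2).map {
case (id, ((name, clazz), (subject, score))) => (id, name, clazz, subject, score)
}.foreach(println)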
4. Find the 100 students with the most uneven scores across subjects [student ID, name, class, subject, score]
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object test4 {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
val sc = new SparkContext(conf)
val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")
/**
 * 4. Find the 100 students with the most uneven scores across subjects [student ID, name, class, subject, score]
 * "Uneven" is measured by the variance of each student's scores; see the calculation below.
 */
val scoreInfo = scoreRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val score = lines(2).toInt
(id, score)
})
val groupRDD = scoreInfo.groupByKey()
// variance of each student's scores: the larger the variance, the more uneven the student's subjects
val covRDD = groupRDD.map(line => {
val id: String = line._1
val scores: Iterable[Int] = line._2
// use Double division; integer division (scores.sum / 6) would lose precision
val avg_score: Double = scores.sum.toDouble / scores.size
val variance: Double = scores.map(s => {
(s - avg_score) * (s - avg_score)
}).sum / scores.size
(id, variance)
})
// IDs of the 100 students with the largest variance
val ids: Array[String] = covRDD.sortBy(kv => kv._2, ascending = false).take(100).map(kv => kv._1)
val filterRDD = stuRDD.filter(line => {
val id: String = line.split(",")(0)
ids.contains(id)
})
val resultRDD = filterRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val name: String = lines(1)
val clazz: String = lines(4)
(id, (name, clazz))
})
val rsRDD = scoreRDD.map(line => {
val lines: Array[String] = line.split(",")
val id: String = lines(0)
val subject: String = lines(1)
val score: Int = lines(2).toInt
(id, (subject, score))
})
val joinRDD: RDD[(String, ((String, String), Option[(String, Int)]))] = resultRDD.leftOuterJoin(rsRDD)
joinRDD.map {
case (id: String, ((name: String, clazz: String), Some((subject: String, score: Int)))) => (id, name, clazz, subject, score)
}.foreach(println)
}
}
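The measure used above is the population variance of a student's subject scores: variance = sum((x_i - mean)^2) / n. A minimal hedged sketch of the arithmetic on plain Scala collections, using the scores of student 1500100001 from the sample data above:

val scores = Seq(98, 5, 137, 29, 85, 52)
val mean = scores.sum.toDouble / scores.size                        // 406 / 6 ≈ 67.67
val variance = scores.map(s => math.pow(s - mean, 2)).sum / scores.size
println(variance)                                                   // ≈ 1949.2; larger means more uneven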