Solving Problems with Spark Operators

Data used in the code

Sample of the score data (student ID, subject ID, score):

1500100001,1000001,98
1500100001,1000002,5
1500100001,1000003,137
1500100001,1000004,29
1500100001,1000005,85
1500100001,1000006,52
1500100002,1000001,139
1500100002,1000002,102
1500100002,1000003,44
1500100002,1000004,18
1500100002,1000005,46
1500100002,1000006,91

Sample of the student data (student ID, name, age, gender, class):

1500100001,施笑槐,22,女,文科六班
1500100002,吕金鹏,24,男,文科六班
1500100003,单乐蕊,22,女,理科六班
1500100004,葛德曜,24,男,理科三班
1500100005,宣谷芹,22,女,理科五班
1500100006,边昂雄,21,男,理科二班
1500100007,尚孤风,23,女,文科六班
1500100008,符半双,22,女,理科六班

1. Find the per-subject scores of the grade's top 10 students by total score [student ID, name, class, subject, score]

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

object test1 {
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
    val sc = new SparkContext(conf)
    val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
    val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")

    /**
      * 1. Per-subject scores of the grade's top 10 students [ID, name, class, subject, score]
      */
    // (student ID, score) for every score row
    val scoreInfo: RDD[(String, Int)] = scoreRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val score: Int = lines(2).toInt
      (id, score)
    })
    // total score per student
    val score_reduceRDD: RDD[(String, Int)] = scoreInfo.reduceByKey((x, y) => x + y)

    // IDs of the 10 students with the highest totals
    val ids: Array[String] = score_reduceRDD.sortBy(kv => kv._2, ascending = false).take(10)
      .map(line => line._1)

    // broadcast the small ID array so it is shipped to each executor once
    val broadcast: Broadcast[Array[String]] = sc.broadcast(ids)

    // keep only the student rows belonging to the top 10 IDs
    val filterRDD: RDD[String] = stuRDD.filter(line => {
      val id: String = line.split(",")(0)
      broadcast.value.contains(id)
    })

    // (ID, (name, class))
    val kv_RDD = filterRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val name: String = lines(1)
      val clazz: String = lines(4)
      (id, (name, clazz))
    })

    // (ID, (score, subject)) for every score row
    val scoreInfo2 = scoreRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val subject: String = lines(1)
      val score = lines(2).toInt
      (id, (score, subject))
    })
    // join student info with the scores and print the result
    val resultRDD = kv_RDD.join(scoreInfo2)
    resultRDD.map {
      case (id: String, ((name: String, clazz: String), (score: Int, subject: String))) =>
        (id, name, clazz, subject, score)
    }.foreach(println)
  }
}
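A side note on the top-10 step: sortBy followed by take sorts the whole RDD just to read off ten rows. RDD also offers a top action that returns the k largest elements under a given ordering, so the full sort can be avoided. A minimal sketch of the alternative, reusing score_reduceRDD from the code above:

// Sketch: the ten highest totals via RDD.top instead of a full sortBy + take.
val topIds: Array[String] = score_reduceRDD
  .top(10)(Ordering.by((kv: (String, Int)) => kv._2))  // k largest pairs by total
  .map(_._1)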

2. Find the students whose total score is above the grade average [student ID, name, class, total score]

Note: while writing this code I ran into a problem: contains cannot be called on an RDD. The RDD has to be turned into an array with the collect operator first, and then contains can be used inside the filter.
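Collecting is fine here because the filtered ID list is small. For a larger lookup set, the collected array can additionally be wrapped in a broadcast variable, as test 1 does, so it travels to each executor once instead of inside every task closure. A minimal sketch of the pattern (someKvRDD and someLines are hypothetical names, not from the code below):

// Sketch: collect a small keyed RDD to the driver, then broadcast it for lookups.
// someKvRDD / someLines are placeholders for illustration only.
val smallIds: Array[String] = someKvRDD.map(_._1).collect()
val bc: Broadcast[Array[String]] = sc.broadcast(smallIds)
val kept: RDD[String] = someLines.filter(line => bc.value.contains(line.split(",")(0)))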

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object test2 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
    val sc = new SparkContext(conf)
    val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
    val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")

    /**
      * 2. Students whose total score is above the grade average [ID, name, class, total score]
      */
    // total score per student
    val sumscoreInfo: RDD[(String, Int)] = scoreRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val score = lines(2).toInt
      (id, score)
    }).reduceByKey(_ + _)
    // grade average: sum of every score divided by the number of students
    val score_avg: Double = scoreRDD.map(line => line.split(",")(2).toInt).sum() / stuRDD.count()

    // students whose total is above the average
    val filter_IDs: RDD[(String, Int)] = sumscoreInfo.filter(line => {
      line._2 > score_avg
    })

    // contains cannot be called on an RDD, so collect the IDs into an array first
    val ids = filter_IDs.map(line => line._1).collect()

    val filterRDD = stuRDD.filter(line => {
      val id: String = line.split(",")(0)
      ids.contains(id)
    })
    // (ID, (name, class))
    val resultRDD = filterRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val name: String = lines(1)
      val clazz: String = lines(4)
      (id, (name, clazz))
    })
    // join with the totals and print the result
    val joinRDD: RDD[(String, ((String, String), Int))] = resultRDD.join(sumscoreInfo)
    joinRDD.map {
      case (id: String, ((name: String, clazz: String), score: Int)) => (id, name, clazz, score)
    }.foreach(println)
  }
}
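As an aside, the average step can be written more directly with Spark's numeric RDD helpers: calling mean() on the per-student totals should give the same grade average, provided every student actually has score rows. A sketch, reusing sumscoreInfo from above:

// Sketch: grade average via mean() on the per-student totals.
// Matches sum-of-scores / student-count as long as no student is missing scores.
val avgTotal: Double = sumscoreInfo.map(_._2).mean()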

3. Find the students who passed every subject [student ID, name, class, subject, score]

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object test3 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
    val sc = new SparkContext(conf)
    val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
    val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")

    /**
      * 3. Students who passed every subject [ID, name, class, subject, score]
      */
    // keep only passing scores (a pass is 60 or above)
    val filterRDD = scoreRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val score: Int = lines(2).toInt
      (id, score)
    }).filter(kv => kv._2 >= 60)

    // a student passed everything if all 6 subjects survive the filter
    val groupRDD: RDD[(String, Iterable[(String, Int)])] = filterRDD.groupBy(kv => kv._1)
    val ids: Array[String] = groupRDD.filter(line => line._2.size == 6).map(line => line._1).collect()

    // (ID, (name, class)) for the qualifying students
    val resultRDD = stuRDD.filter(line => ids.contains(line.split(",")(0)))
      .map(line => {
        val lines: Array[String] = line.split(",")
        val id: String = lines(0)
        val name: String = lines(1)
        val clazz: String = lines(4)
        (id, (name, clazz))
      })

    // (ID, (subject, score)) for every score row
    val resultRDD2 = scoreRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val subject: String = lines(1)
      val score: String = lines(2)
      (id, (subject, score))
    })
    // every student on the left has score rows, so the Some pattern always matches
    val joinRDD: RDD[(String, ((String, String), Option[(String, String)]))] = resultRDD.leftOuterJoin(resultRDD2)
    joinRDD.map {
      case (id: String, ((name: String, clazz: String), Some((subject: String, score: String)))) => (id, name, clazz, subject, score)
    }.foreach(println)
  }
}
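One caveat about the join above: the pattern match covers only Some, which is safe on this data (every student surviving the filter has score rows) but would throw a MatchError otherwise. Since unmatched rows are not wanted anyway, an inner join expresses the same thing without the Option. A sketch, reusing resultRDD and resultRDD2:

// Sketch: plain join instead of leftOuterJoin + a partial match on Some.
resultRDD.join(resultRDD2).map {
  case (id, ((name, clazz), (subject, score))) => (id, name, clazz, subject, score)
}.foreach(println)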

4. Find the 100 students with the most unbalanced subject scores [student ID, name, class, subject, score]
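"Most unbalanced" is measured here by the variance of each student's six subject scores: the further the scores spread around their mean, the larger the variance. For scores $x_1, \dots, x_n$ with mean $\mu$:

$$\mu = \frac{1}{n}\sum_{i=1}^{n} x_i, \qquad \sigma^2 = \frac{1}{n}\sum_{i=1}^{n} (x_i - \mu)^2$$

The code below computes exactly this per student (with $n = 6$) and keeps the 100 largest variances.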

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object test4 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("sql").setMaster("local")
    val sc = new SparkContext(conf)
    val scoreRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\score.txt")
    val stuRDD: RDD[String] = sc.textFile("D:\\bigdataPlus\\Spark\\src\\main\\java\\com\\shujia\\spark\\Day3_test_spark\\data\\students.txt")

    /**
      * 4. The 100 students with the most unbalanced subjects [ID, name, class, subject, score]
      *    Ranked by the variance of each student's scores.
      */
    // (student ID, score) for every score row
    val scoreInfo = scoreRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val score = lines(2).toInt
      (id, score)
    })
    // group each student's six scores and compute the variance
    val groupRDD = scoreInfo.groupByKey
    val varianceRDD = groupRDD.map(line => {
      val id: String = line._1
      val scores: Iterable[Int] = line._2
      // Double arithmetic, so the mean is not truncated by integer division
      val avg_score: Double = scores.sum.toDouble / scores.size
      val variance: Double = scores.map(s => {
        (s - avg_score) * (s - avg_score)
      }).sum / scores.size
      (id, variance)
    })

    // the 100 largest variances are the most unbalanced students
    val ids: Array[String] = varianceRDD.sortBy(kv => kv._2, ascending = false).take(100).map(kv => kv._1)

    val filterRDD = stuRDD.filter(line => {
      val id: String = line.split(",")(0)
      ids.contains(id)
    })
    // (ID, (name, class))
    val resultRDD = filterRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val name: String = lines(1)
      val clazz: String = lines(4)
      (id, (name, clazz))
    })
    // (ID, (subject, score)) for every score row
    val rsRDD = scoreRDD.map(line => {
      val lines: Array[String] = line.split(",")
      val id: String = lines(0)
      val subject: String = lines(1)
      val score: Int = lines(2).toInt
      (id, (subject, score))
    })
    // every filtered student has score rows, so the Some pattern always matches
    val joinRDD: RDD[(String, ((String, String), Option[(String, Int)]))] = resultRDD.leftOuterJoin(rsRDD)
    joinRDD.map {
      case (id: String, ((name: String, clazz: String), Some((subject: String, score: Int)))) => (id, name, clazz, subject, score)
    }.foreach(println)
  }
}
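As a closing note, groupByKey buffers all of a student's scores before the variance is computed. With only six scores per student that is harmless, but the same result can be obtained in one pass with the identity Var(X) = E[X²] − E[X]², accumulating count, sum, and sum of squares per key. A sketch, assuming scoreInfo: RDD[(String, Int)] from the code above:

// Sketch: per-key (count, sum, sum of squares) via aggregateByKey, then
// variance = sumSq/n - (sum/n)^2, avoiding groupByKey's per-key buffers.
val varianceRDD2: RDD[(String, Double)] = scoreInfo
  .aggregateByKey((0L, 0L, 0L))(
    // fold one score into the accumulator
    { case ((n, sum, sumSq), x) => (n + 1, sum + x, sumSq + x.toLong * x) },
    // merge two partial accumulators
    { case ((n1, s1, q1), (n2, s2, q2)) => (n1 + n2, s1 + s2, q1 + q2) }
  )
  .map { case (id, (n, sum, sumSq)) =>
    val mean = sum.toDouble / n
    (id, sumSq.toDouble / n - mean * mean)
  }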

 
