SparkDay02

1. Union: merging two RDDs

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo13Union {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("union")
    val sc = new SparkContext(conf)

    val rdd1: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6))
    val rdd2: RDD[Int] = sc.parallelize(List(3, 4, 5, 6, 7, 8, 9))

    println(s"rdd1:${rdd1.getNumPartitions}")
    println(s"rdd2:${rdd2.getNumPartitions}")

    /**
     * union: merges two RDDs without deduplicating; both RDDs must have exactly the same element type.
     * The merge is only logical; the data of the two RDDs is not physically combined.
     */

    val unionRDD: RDD[Int] = rdd1.union(rdd2)

    println(s"unionRDD:${unionRDD.getNumPartitions}")

    unionRDD.foreach(println)
  }

}
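As an aside, union simply concatenates the partitions, so unionRDD has rdd1.getNumPartitions + rdd2.getNumPartitions partitions and keeps every duplicate. A minimal sketch (variable names are illustrative), reusing rdd1 and rdd2 from the demo above, of emulating a SQL-style UNION by chaining distinct:

    //union keeps duplicates; distinct afterwards removes them, at the cost of a shuffle
    val unionAll: RDD[Int] = rdd1.union(rdd2)
    val unionDistinct: RDD[Int] = unionAll.distinct()
    println(s"unionAll: ${unionAll.count()}, unionDistinct: ${unionDistinct.count()}")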

2. Join

1. innerJoin

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo14Join {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setAppName("join")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val idNameRDD: RDD[(String, String)] = sc.parallelize(List(
      ("001", "小伟"),
      ("002", "张三"),
      ("003", "刘思思"),
      ("004", "王五")
    ))

    val idAgeRDD: RDD[(String, String)] = sc.parallelize(List(
      ("001", "23"),
      ("002", "24"),
      ("003", "25"),
      ("004", "23")
    ))

    /**
     * innerJoin: a key appears in the result only if it exists in both RDDs.
     */

    val innerJoinRDD: RDD[(String, (String, String))] = idNameRDD.join(idAgeRDD)

    //reshape the joined records
    innerJoinRDD
      .map {
        case (id: String, (name: String, age: String)) =>
          (id, name, age.toInt)
      }
      .foreach(println)

2. leftOuterJoin

    /**
     * leftOuterJoin: keeps every key from the left RDD; if the right RDD does not have the key, the value is None.
     *
     * Option: an optional value, either Some(value) or None.
     */

    val leftJoinRDD: RDD[(String, (String, Option[String]))] = idNameRDD.leftOuterJoin(idAgeRDD)

    //reshape the joined records
    leftJoinRDD
      .map {
        //case: the key matched on both sides
        case (id: String, (name: String, Some(age))) =>
          (id, name, age.toInt)

        //case: no match on the right side
        case (id: String, (name: String, None)) =>
          (id, name, 0)
      }
      .foreach(println)

3. fullOuterJoin

    /**
     * fullOuterJoin: keeps keys from both RDDs; when only one side has the key, the other side is None.
     */

    val fullJoinRDD: RDD[(String, (Option[String], Option[String]))] = idNameRDD.fullOuterJoin(idAgeRDD)

    fullJoinRDD
      .map {
        case (id: String, (Some(name), None)) =>
          (id, name, 0)

        case (id: String, (None, Some(age))) =>
          (id, "默认值", age)

        case (id:String,(Some(name),Some(age)))=>
          (id,name,age)
      }
      .foreach(println)

  }
}
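For completeness, the remaining join variants follow the same pattern. A minimal sketch (variable names are illustrative), reusing idNameRDD and idAgeRDD from the demo above:

    //rightOuterJoin: keeps every key of the right RDD; the left side becomes an Option
    val rightJoinRDD: RDD[(String, (Option[String], String))] = idNameRDD.rightOuterJoin(idAgeRDD)

    //cogroup: collects the values of both RDDs per key instead of pairing them one by one
    val cogroupRDD: RDD[(String, (Iterable[String], Iterable[String]))] = idNameRDD.cogroup(idAgeRDD)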

3. MapValues

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo15MapValues {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    conf.setMaster("local")
    conf.setAppName("mapValues")

    val sc = new SparkContext(conf)

    val idAgeRDD: RDD[(String, Int)] = sc.parallelize(List(
      ("001", 23),
      ("002", 24),
      ("003", 25),
      ("004", 23)
    ))

    /**
     * mapValues: keeps the key unchanged and applies the function to the value only.
     */

    val rdd: RDD[(String, Int)] = idAgeRDD.mapValues(age => age + 1)

    rdd.foreach(println)
  }

}
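The same result written with map is sketched below (the variable name is illustrative); on key-value RDDs mapValues is preferable because it tells Spark the keys are unchanged, so an existing partitioner can be kept.

    //equivalent result with map; any partitioner on idAgeRDD would be lost
    val plusOneRDD: RDD[(String, Int)] = idAgeRDD.map { case (id, age) => (id, age + 1) }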

4. Sort

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo16Sort {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    conf.setMaster("local")
    conf.setAppName("sort")

    val sc = new SparkContext(conf)

    val linesRDD: RDD[String] = sc.textFile("data/score.txt")

    //extract the student id and the score
    val scoreRDD: RDD[(String, Int)] = linesRDD
      .map(line => line.split(","))
      .filter(arr => arr.length == 3)
      .map {
        case Array(id: String, _: String, sco: String) =>
          (id, sco.toInt)
      }

    //compute the total score per student
    val sumScoreRDD: RDD[(String, Int)] = scoreRDD.reduceByKey(_ + _)

    /**
     * sortBy: sort by a field selected by a function
     * sortByKey: sort by the key
     * Both are ascending by default.
     */

    val sortByRDD: RDD[(String, Int)] = sumScoreRDD.sortBy(kv => kv._2, ascending = false)

    sortByRDD.foreach(println)
  }

}
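A minimal sketch of the sortByKey variant mentioned in the comment (the variable name is illustrative), reusing sumScoreRDD: move the score into the key, then sort by it.

    //sortByKey sorts a (K, V) RDD by its key, so the score is swapped into the key position first
    val sortByKeyRDD: RDD[(Int, String)] = sumScoreRDD
      .map { case (id, sum) => (sum, id) }
      .sortByKey(ascending = false)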

5. Distinct

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo17Distinct {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    conf.setAppName("distinct")
    conf.setMaster("local")

    val sc = new SparkContext(conf)

    val rdd1: RDD[Int] = sc.parallelize(List(1, 1, 12, 5, 2, 2, 52, 41, 5, 3, 32, 4, 5))

    /**
     * distinct: removes duplicate elements; it triggers a shuffle.
     */

    val distinctRDD: RDD[Int] = rdd1.distinct()

    distinctRDD.foreach(println)
  }

}
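Conceptually, distinct is built out of the shuffle operators covered earlier. A rough sketch of an equivalent pipeline, reusing rdd1 (the variable name is illustrative; Spark itself uses a null placeholder value internally):

    //roughly what distinct does: key each element, reduce per key, drop the dummy value
    val manualDistinct: RDD[Int] = rdd1
      .map(x => (x, 1))
      .reduceByKey((a, _) => a)
      .map(_._1)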

7. RDD action operators

package com.shujia.spark.core

import com.shujia.spark.util.HDFSUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo18Action {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    conf.setMaster("local")
    conf.setAppName("action")

    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")

    /**
     * Action operators: every action operator triggers a separate job.
     */

    /**
     * count: returns the number of records in the RDD.
     */
    val count: Long = studentRDD.count()

    println(count)


    /**
     * sum: sums the elements of the RDD; the element type must be numeric.
     */

    val sumAge: Double = studentRDD
      .map(line => line.split(",")(2).toInt)
      //sum over all elements; only works for numeric types
      .sum()

    println(sumAge / count)

    /**
     * take: returns the first N elements as an array; taking too many can overflow the driver's memory.
     */

    val top10: Array[String] = studentRDD.take(10)

    top10.foreach(println)

    /**
     * collect: pulls the whole RDD back to the driver as an array; a large RDD can overflow the driver's memory (default around 1G).
     */

    val array: Array[String] = studentRDD.collect()

    array.foreach(println)

    /**
     * foreach: iterates over the RDD's records; it is also an action operator.
     * foreachPartition: passes one whole partition at a time to the function.
     */
    studentRDD.foreach(println)

    studentRDD.foreachPartition((iter: Iterator[String]) => {
      iter.foreach(println)
    })

    /**
     * saveAsTextFile: writes the data out as text files (to HDFS, or to the local file system in local mode).
     */

    HDFSUtil.deletePath("data/test")

    studentRDD.saveAsTextFile("data/test")

    //keep the JVM alive so the Spark web UI can still be inspected
    while (true) {

    }
  }

}
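HDFSUtil is a small project-local helper that is not shown in these notes. A minimal sketch of what its deletePath might look like, using the Hadoop FileSystem API (this implementation is an assumption; only the call site above is from the original code):

    package com.shujia.spark.util

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}

    object HDFSUtil {
      //assumed implementation: recursively delete the path if it exists,
      //so saveAsTextFile does not fail when the output directory is already there
      def deletePath(path: String): Unit = {
        val fs: FileSystem = FileSystem.get(new Configuration())
        val p = new Path(path)
        if (fs.exists(p)) {
          fs.delete(p, true)
        }
      }
    }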

8. Student exercise: the 100 students with the most uneven subject scores

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo19Student {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    conf.setAppName("student")
    conf.setMaster("local")

    val sc = new SparkContext(conf)

    /**
     * Exercise 4: find the 100 students whose scores are the most uneven across subjects.
     *
     * The unevenness is measured with the variance of each student's scores:
     * 1. compute the mean of the student's scores
     * 2. compute the variance around that mean
     */

    //1. read the data
    val scoresRDD: RDD[String] = sc.textFile("data/score.txt")

    //clean and reshape the data
    val kvRDD: RDD[(String, String, Double)] = scoresRDD
      //split each line
      .map(_.split(","))
      //filter out malformed records
      .filter(_.length == 3)
      .map {
        case Array(sId: String, cId: String, sco: String) =>
          (sId, cId, sco.toDouble)
      }

    //2. group by student id
    val groupByRDD: RDD[(String, Iterable[(String, String, Double)])] = kvRDD.groupBy(_._1) //equivalent to (line => line._1)

    //compute the variance per student
    val stdRDD: RDD[(String, Double)] = groupByRDD.map {
      case (id: String, iter: Iterable[(String, String, Double)]) =>
        //extract the scores
        val scores: List[Double] = iter.map(_._3).toList //equivalent to (line => line._3)

        /**
         * Computing the variance:
         * 1. compute the mean
         * 2. apply the variance formula
         */
        val avg: Double = scores.sum / scores.length

        val std: Double = scores.map(i => (i - avg) * (i - avg)).sum / scores.length

        (id, std)
    }

    //sort by variance, descending
    val sortByRDD: RDD[(String, Double)] = stdRDD.sortBy(_._2, ascending = false)

    //the 100 students with the highest variance
    val top100: Array[(String, Double)] = sortByRDD.take(100)

    top100.foreach(println)
  }

}
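groupBy materialises all of a student's scores on one executor, which is fine here because each student only has a handful of subjects. As a sketch of a more shuffle-friendly alternative (variable names are illustrative), the same statistic can be computed in one pass with aggregateByKey, using Var(x) = E[x^2] - E[x]^2 and reusing kvRDD from above:

    //one-pass variance per student: accumulate (count, sum, sum of squares) per key
    val stdRDD2: RDD[(String, Double)] = kvRDD
      .map { case (sId, _, sco) => (sId, sco) }
      .aggregateByKey((0L, 0.0, 0.0))(
        (acc, sco) => (acc._1 + 1, acc._2 + sco, acc._3 + sco * sco),
        (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3)
      )
      .mapValues { case (n, sum, sumSq) =>
        val avg = sum / n
        sumSq / n - avg * avg
      }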

9. Submit: running on the cluster

package com.shujia.spark.core

import com.shujia.spark.util.HDFSUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Submit {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()

    //when submitting to a cluster, do not hard-code the master here; it is supplied by spark-submit
//    conf.setMaster("local")
    conf.setAppName("submit")

    val sc = new SparkContext(conf)

    //read the data from HDFS
    val studentRDD: RDD[String] = sc.textFile("/data/students.txt")

    //extract the class of each student
    val kvRDD: RDD[(String, Int)] = studentRDD
      .map(line => {
        val split: Array[String] = line.split(",")
        (split(4), 1)
      })

    //count the number of students per class
    val classNumRDD: RDD[(String, Int)] = kvRDD.reduceByKey(_ + _)

    //format the output lines
    val resultRDD: RDD[String] = classNumRDD
      .map {
        case (clazz: String, num: Int) =>
          s"$clazz\t$num"
      }

    //delete the output path if it already exists
    HDFSUtil.deletePath("/data/clazz_num")
    //save the result to HDFS
    resultRDD.saveAsTextFile("/data/clazz_num")


    /**
     * Package the project, upload the jar to the cluster, then submit it:
     *
     * spark-submit --class com.shujia.spark.core.Demo20Submit --master yarn-client spark-1.0.jar
     */
  }

}
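Note that the yarn-client master string is deprecated on Spark 2.x and later; the equivalent submission with the same class and jar name would typically be:

    spark-submit --class com.shujia.spark.core.Demo20Submit --master yarn --deploy-mode client spark-1.0.jar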

10. Student exercise: students who passed every subject

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo21Student {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setAppName("student")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    /**
     * Exercise 3: find the students who passed every subject.
     */

    //1. read the score table
    val scoreRDD: RDD[String] = sc.textFile("data/score.txt")

    //2. read the subject table
    val subjectRDD: RDD[String] = sc.textFile("data/subject.txt")

    //convert both tables to key-value format, keyed by subject id
    val scoreKVRDD: RDD[(String, (String, Int))] = scoreRDD
      .map(_.split(","))
      .filter(_.length == 3)
      .map {
        case Array(sid: String, cid: String, sco: String) =>
          (cid, (sid, sco.toInt))
      }

    val subjectKVRDD: RDD[(String, Int)] = subjectRDD
      .map(_.split(","))
      .filter(_.length == 3)
      .map {
        case Array(cid: String, _: String, sumSco: String) =>
          (cid, sumSco.toInt)
      }

    //join the subject table with the score table
    val joinRDD: RDD[(String, (Int, (String, Int)))] = subjectKVRDD.join(scoreKVRDD)

    //reshape the joined records
    val comRDD: RDD[(String, String, Int, Int)] = joinRDD.map {
      case (cid: String, (sumSco: Int, (sid: String, sco: Int))) =>
        (sid, cid, sumSco, sco)
    }

    //keep only the passing scores (at least 60% of the full score)
    val filterRDD: RDD[(String, String, Int, Int)] = comRDD.filter {
      case (_: String, _: String, sumSco: Int, sco: Int) =>
        sco >= sumSco * 0.6
    }

    //count the passed subjects per student and keep those who passed all 6
    val groupByRDD: RDD[(String, Iterable[(String, String, Int, Int)])] = filterRDD.groupBy(_._1)
    val resultRDD: RDD[(String, Iterable[(String, String, Int, Int)])] = groupByRDD.filter(_._2.size == 6)

    //flatten the groups back into individual records
    val doujigeRDD: RDD[(String, String, Int, Int)] = resultRDD.flatMap(kv => kv._2)

    doujigeRDD.foreach(println)
  }

}
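If only the ids of the qualifying students are needed, the groupBy can be replaced by a simple count per student, which shuffles far less data. A minimal sketch (the variable name is illustrative), reusing filterRDD from above:

    //count the passed subjects per student and keep those with all 6
    val passedAllIds: RDD[String] = filterRDD
      .map { case (sid, _, _, _) => (sid, 1) }
      .reduceByKey(_ + _)
      .filter(_._2 == 6)
      .map(_._1)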

11. Estimating PI with code

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Random

object Demo22PI {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setAppName("pi")
    conf.setMaster("local[8]")
    val sc = new SparkContext(conf)

    val list: Range = 0 until 1000000

    //total number of points
    val length: Int = list.length

    //1. build an RDD with many elements
    val listRDD: RDD[Int] = sc.parallelize(list, 8)

    //2. generate a random point inside the square for every element
    val pointsRDD: RDD[(Double, Double)] = listRDD.map(i => {
      //random coordinates in [-1, 1)
      val x: Double = Random.nextDouble() * 2 - 1
      val y: Double = Random.nextDouble() * 2 - 1
      (x, y)
    })

    //keep the points that fall inside the unit circle
    val yuanPointRDD: RDD[(Double, Double)] = pointsRDD.filter {
      case (x: Double, y: Double) =>
        (x * x + y * y) < 1
    }

    //apply the ratio formula: PI ≈ 4 * (points in circle) / (total points)
    val PI: Double = yuanPointRDD.count().toDouble / length * 4.0

    println(s"PI is :$PI")
  }

}
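Why the factor of 4 in the last step works: the points are drawn uniformly from the square [-1, 1] x [-1, 1], which has area 4, and the unit circle inside it has area PI * 1^2, so

    points_in_circle / total_points ≈ area(circle) / area(square) = PI / 4,  hence  PI ≈ 4 * points_in_circle / total_points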

12. Acc: accumulators

package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object Demo23Acc {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setAppName("acc")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt",2)

    //code outside the operators runs on the Driver side
    var count = 0

    studentRDD.foreach(stu => {
      //code inside an operator runs on the Executor side
      //a plain driver-side variable must not be modified from inside an operator;
      //even if it is modified on the executor, the change is never visible back on the driver
      count += 1
      println(count)
    })

    println(count)

    /**
     * Accumulators
     */

    //1. create an accumulator (here Spark's built-in long accumulator)
    val countAcc: LongAccumulator = sc.longAccumulator

    studentRDD.foreach(stu => {
      //2. add to the accumulator inside the operator
      countAcc.add(1)
    })

    //3. read the accumulated value on the Driver, outside the operator
    println(countAcc.value)

    //keep the JVM alive so the Spark web UI can still be inspected
    while (true) {

    }
  }

}
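longAccumulator is one of Spark's built-in accumulators (doubleAccumulator and collectionAccumulator also exist); a genuinely custom accumulator extends AccumulatorV2. A minimal sketch (the class and its use are illustrative, not part of the original demo):

    import org.apache.spark.util.AccumulatorV2

    //a custom accumulator that collects distinct strings into a Set
    class SetAccumulator extends AccumulatorV2[String, Set[String]] {
      private var set: Set[String] = Set.empty

      override def isZero: Boolean = set.isEmpty
      override def copy(): AccumulatorV2[String, Set[String]] = {
        val acc = new SetAccumulator
        acc.set = set
        acc
      }
      override def reset(): Unit = set = Set.empty
      override def add(v: String): Unit = set = set + v
      override def merge(other: AccumulatorV2[String, Set[String]]): Unit = set = set ++ other.value
      override def value: Set[String] = set
    }

    //register it on the SparkContext before using it inside operators:
    //val classAcc = new SetAccumulator
    //sc.register(classAcc, "classes")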

13. Bro: broadcast variables

package com.shujia.spark.core

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo24Bro {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setAppName("bro")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")

    val scoreRDD: RDD[String] = sc.textFile("data/score.txt")

    //another RDD cannot be used inside an operator (nested RDDs are not allowed)
    //    scoreRDD.foreach(sco=>{
    //      studentRDD.foreach(println)
    //    })

    //pull the RDD's data back to the Driver as an array
    val studentArray: Array[String] = studentRDD.collect()
    //convert to key-value pairs
    val kvStuRDD: Array[(String, String)] = studentArray.map(stu => {
      val split: Array[String] = stu.split(",")
      val id: String = split(0)
      (id, stu)
    })

    //convert to a Map
    val stuMap: Map[String, String] = kvStuRDD.toMap

    //broadcast the plain variable to the executors
    val stuMapBro: Broadcast[Map[String, String]] = sc.broadcast(stuMap)

    /**
     * map join: the join happens on the map side, so no shuffle is produced.
     * Use it when joining a large table against a small table.
     */

    val joinRDD: RDD[(String, String)] = scoreRDD.map(sco => {
      val split: Array[String] = sco.split(",")
      val id: String = split(0)
      //look up the student's info in the broadcast map by id
      //the broadcast variable's value is read inside the operator
      val broValue: Map[String, String] = stuMapBro.value
      val stuInfo: String = broValue.getOrElse(id, "unknown")
      (stuInfo, sco)
    })

    joinRDD.foreach(println)

    //keep the JVM alive so the Spark web UI can still be inspected
    while (true) {

    }
  }

}
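When a broadcast variable is no longer needed it can be released explicitly; both calls below exist on the Broadcast handle:

    stuMapBro.unpersist() //drop the cached copies on the executors; the value is re-sent if it is used again
    stuMapBro.destroy()   //release all resources permanently; the broadcast must not be used afterwards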
