Summary of common Spark operators

RDD programming API:
1. All transformations on an RDD are lazily evaluated: they do not compute their results right away, but only record the operations applied to the base dataset (for example, a file). The transformations are actually executed only when an action requires a result to be returned to the driver.
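A minimal sketch of this laziness (the object name LazyDemo is illustrative, not from the original post): the map call below is only recorded in the lineage; nothing runs until the collect action asks for a result.
package day02
import org.apache.spark.{SparkConf, SparkContext}

object LazyDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("LazyDemo").setMaster("local"))
    // Transformation: only recorded, nothing is computed yet
    val doubled = sc.parallelize(Array(1, 2, 3)).map(_ * 2)
    // Action: triggers the actual computation and returns the result to the driver
    println(doubled.collect().mkString(","))  // 2,4,6
    sc.stop()
  }
}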
Common Spark operators (part 1)
package day02
import org.apache.spark.{SparkConf, SparkContext}
object RddTest {
  def main(args: Array[String]): Unit = {
//    map()
    // run only one example at a time (each method creates its own SparkContext)
//    filter()
//    flatmap()
//    groupbykey()
//    reducebykey()
//    sortByKey()
    join()
  }
  def map(): Unit = {
    // multiply every element of the collection by 2

    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5)
    val numberRDD = sc.parallelize(number, 1)
    // map takes a function object
    // each element is mapped to num * 2
    val multipmentRdd = numberRDD.map { num => num * 2 }
    // print each element
    multipmentRdd.foreach { num => println(num) }
  }

  def filter(): Unit = {
    // filter: keep only the elements that satisfy a predicate
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5)
    val numberRDD = sc.parallelize(number, 1)
    // {} and () are interchangeable here
    // keep the even numbers: filter traverses the collection and takes a predicate function
    val evennumRDD = numberRDD.filter { num => num % 2 == 0 }
    evennumRDD.foreach { num => println(num) }
  }

  def flatmap(): Unit = {
    // split each text line into individual words
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val lineArry = Array("hello java", "hello python", "hello R", "hello you")
    val lines = sc.parallelize(lineArry, 1)
    val words = lines.flatMap { lines => lines.split(" ") }
    words.foreach { words => println(words) }
  }

  def groupbykey(): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val corelist = Array(
      Tuple2("class1", 34), Tuple2("class2", 26),
      Tuple2("class1", 69), Tuple2("class2", 87)
    )
    val cores = sc.parallelize(corelist,1)
    val groupscore = cores.groupByKey()
    groupscore.foreach { score =>
      println(score._1)
      score._2.foreach(sing => println(sing))
    }
    /*
    class1 34 69
    class2 26 87
    */
  }
  def reducebykey(): Unit ={
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val corelist = Array(
      Tuple2("class1", 34), Tuple2("class2", 26),
      Tuple2("class1", 69), Tuple2("class2", 87)
    )
    val scores = sc.parallelize(corelist,1)
    // merge the values that share the same key, keeping one record per key
    val totalScore =scores.reduceByKey(_+_)
    totalScore.foreach(classScore =>println(classScore._1+" "+classScore._2))
    /*class1 103   class2 113*/
  }
  def sortByKey(): Unit ={
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val sortlist = Array(
      Tuple2(3,"xiaoming"),Tuple2(113,"xiaoqiang"),Tuple2(132,"xiaolv"),Tuple2(43,"xiaoxiao")
    )
    val scores = sc.parallelize(sortlist,1)
    val sortedScore = scores.sortByKey()
    sortedScore.foreach{
      sortedScore =>println(sortedScore._1+" "+sortedScore._2)
    }
    /*
      3 xiaoming
      43 xiaoxiao
      113 xiaoqiang
      132 xiaolv
    * */
  }
  def join(): Unit = {
    // join the two RDDs and print each student's score
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val studentList = Array(
      Tuple2(1,"loe"),
      Tuple2(2,"jiek"),
      Tuple2(3,"tom")
    )
    val scoreList = Array(
      Tuple2(1,23),
      Tuple2(2,35),
      Tuple2(3,24)
    )
    val students = sc.parallelize(studentList)
    val scores = sc.parallelize(scoreList)
    val studentsScores = students.join(scores)
    studentsScores.foreach(
      studentsScores => {
        println("student id: " + studentsScores._1)
        println("student name: " + studentsScores._2._1)
        println("student score: " + studentsScores._2._2)
        println("-------------------------------")
      }
    )
    /*
    student id: 1
    student name: loe
    student score: 23
    -------------------------------
    student id: 3
    student name: tom
    student score: 24
    -------------------------------
    student id: 2
    student name: jiek
    student score: 35
    -------------------------------
    * */
  }
}
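The two key-based operators above compute the same per-class totals. A minimal self-contained sketch comparing them (the object name GroupVsReduce is illustrative): reduceByKey combines values on the map side before the shuffle, while groupByKey ships every individual value across the network, so reduceByKey is usually preferred for aggregations.
package day02
import org.apache.spark.{SparkConf, SparkContext}

object GroupVsReduce {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SparkRDD").setMaster("local"))
    val corelist = Array(("class1", 34), ("class2", 26), ("class1", 69), ("class2", 87))
    val scores = sc.parallelize(corelist, 1)
    // groupByKey moves every value through the shuffle, then we sum locally
    val sumViaGroup = scores.groupByKey().mapValues(_.sum)
    // reduceByKey sums while shuffling (map-side combine); same result
    val sumViaReduce = scores.reduceByKey(_ + _)
    sumViaGroup.foreach(println)   // (class1,103) (class2,113)
    sumViaReduce.foreach(println)  // (class1,103) (class2,113)
    sc.stop()
  }
}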
Common action operations
package day02

import org.apache.spark.{SparkConf, SparkContext}

object ActionRDD {
  def main(args: Array[String]): Unit = {
//    reduce()
//    countBykey()
//    collect()
    take()
  }
  def reduce(): Unit ={
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArry = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArry,1)
    val sum = numbers.reduce(_+_) // sum up all the elements
    println(sum)
  }
  def collect(): Unit ={
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArry = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArry,1)
    val doubleNumbes = numbers.map( num => num * 2)
    println(doubleNumbes)//MapPartitionsRDD[1] at map at ActionRDD.scala:24
    println("------------------")
    // collect pulls the data of the distributed doubleNumbes RDD from the cluster back to the driver;
    // for large RDDs it is usually better to process the data with the foreach action instead
    val doubleNumerArry =doubleNumbes.collect()
    for (num <- doubleNumerArry){
      println(num)  // 2 4 6 8 10 12 14 16 18 20
    }
  }
  def countBykey(): Unit ={
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val studentlist = Array(
      Tuple2("class1", "jiek"), Tuple2("class2", "tpm"),
      Tuple2("class1", "root"), Tuple2("class2", "user")
    )
    val students = sc.parallelize(studentlist,1)
    // count how many records there are for each key
    val studentconut = students.countByKey()
    println(studentconut)
    println("------------------------")
  }

  def take(): Unit ={
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArry = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArry,1)
    val doubleNumbes = numbers.map( num => num * 2)
    // take returns the first three elements in partition order (it does not sort)
    val top3Nums =doubleNumbes.take(3)
    for (num <- top3Nums){
      println(num)  // 2 4 6
    }
  }
}
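As noted in the corrected comment above, take does not sort; when a sorted result is wanted, takeOrdered and top are the usual choices. A minimal sketch (the object name TakeVariants and its sample data are illustrative):
package day02
import org.apache.spark.{SparkConf, SparkContext}

object TakeVariants {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SparkRDD").setMaster("local"))
    val numbers = sc.parallelize(Array(5, 1, 4, 2, 3), 1)
    println(numbers.take(3).mkString(","))        // first three in partition order: 5,1,4
    println(numbers.takeOrdered(3).mkString(",")) // three smallest: 1,2,3
    println(numbers.top(3).mkString(","))         // three largest: 5,4,3
    sc.stop()
  }
}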
Summary of the join operator
package day02

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object joinOperation {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SparkRDD").setMaster("local")
    val sc = new SparkContext(conf)
    val namelist = Array(Tuple2(1,"xiao"),Tuple2(2,"cww"),Tuple2(3,"wd"),Tuple2(4,"wd"))
    val scorelist = Array(Tuple2(1,123),Tuple2(2,34),Tuple2(3,87))
    // makeRDD turns a local collection into an RDD; the 3 means three partitions are created
    // parallelize does the same thing: both are the first step of turning a local sequence into an RDD
    // RDD[(Int, String)]: Int is the key type of NameRDD, String is its value type
    val NameRDD:RDD[(Int,String)] = sc.makeRDD(namelist,3)
    val ScoreRDD =sc.parallelize(scorelist,1)
    // RDD[(Int, (Int, String))]: the join key, then (score from ScoreRDD, name from NameRDD)
    val resultRDD:RDD[(Int,(Int,String))] = ScoreRDD.join(NameRDD)// join is an inner join, so Tuple2(4,"wd") has no match and is dropped
    // leftOuterJoin keeps every key of the left-hand RDD
    val leftOuterJoinResultEDD = NameRDD.leftOuterJoin(ScoreRDD)
    resultRDD.foreachPartition((x =>{
        while (x.hasNext){
          val log = x.next
          val id =log._1
          val name = log._2._2
          val core = log._2._1
          println("id: "+id +"\t name:"+name+"\t core:"+core)
          /*
          id: 1     name:xiao     core:123
          id: 2     name:cww     core:34
          id: 3     name:wd     core:87
          * */
        }
    }))
    leftOuterJoinResultEDD.foreachPartition((x =>{
      while (x.hasNext){
        val log = x.next
        val id =log._1
        val namr = log._2._1
        val core = log._2._2
        println("id: "+id +"\t name:"+namr+"\t core:"+core)
      /*
      id: 3     name:wd     core:Some(87)
      id: 4     name:wd     core:None
      id: 1     name:xiao     core:Some(123)
      id: 2     name:cww     core:Some(34)
       */
      }
    }))
  }
}
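Because leftOuterJoin keeps keys that have no match on the right side, its right-hand value comes back as an Option (Some(...) or None, as the output above shows). A minimal sketch of unwrapping it (the object name LeftJoinOption and the default value -1 are illustrative):
package day02
import org.apache.spark.{SparkConf, SparkContext}

object LeftJoinOption {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SparkRDD").setMaster("local"))
    val names  = sc.parallelize(Array((1, "xiao"), (2, "cww"), (3, "wd"), (4, "wd")))
    val scores = sc.parallelize(Array((1, 123), (2, 34), (3, 87)))
    names.leftOuterJoin(scores).foreach { case (id, (name, scoreOpt)) =>
      // getOrElse supplies a default when the right side has no match (e.g. id 4)
      println("id: " + id + "\t name:" + name + "\t score:" + scoreOpt.getOrElse(-1))
    }
    sc.stop()
  }
}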
Other common operators: mapPartitionsWithIndex, zipWithIndex, zip, countByValue
package day02

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ListBuffer

object Transformationses {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("test")
    val sc = new SparkContext(conf)
    val arr = Array(
      "ABC", "Abc1","Abc10")
    val rdds = sc.parallelize(arr,3)
    val rdd = sc.parallelize(arr, 3)
    val rdd1 = rdds.mapPartitionsWithIndex((index, iter) => {
      val list = new ListBuffer[String]()
      while (iter.hasNext) {
        list.+=("rdds partition index = " + index + ",value = " + iter.next())
      }
      list.iterator // return an iterator over the transformed partition
    }, true)
    rdd1.foreach(println)
    /*
    rdds partition index = 0,value = ABC
    rdds partition index = 1,value = Abc1
    rdds partition index = 2,value = Abc10
    */
    println("--------------------")
    rdd.zipWithIndex().foreach(println)
    /*
    * (ABC,0)
      (Abc1,1)
      (Abc10,2)
    * */
//    rdd.zip(rdds).foreach(println)
    /*
    (ABC,ABC)
    (Abc1,Abc1)
    (Abc10,Abc10)
    * */
    rdd.countByValue().foreach(println)
    /*
    * (ABC,1)
      (Abc1,1)
      (Abc10,1)
    * */
    sc.stop()
  }
}
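Unlike the transformations above, countByValue (like countByKey earlier) is an action: it returns a local Scala Map to the driver rather than another RDD. A minimal sketch (the object name CountByValueDemo and its sample data, with a repeated "ABC", are illustrative):
package day02
import org.apache.spark.{SparkConf, SparkContext}

object CountByValueDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("test"))
    val rdd = sc.parallelize(Array("ABC", "Abc1", "Abc10", "ABC"), 3)
    // countByValue collects a local scala.collection.Map on the driver, not an RDD
    val counts: scala.collection.Map[String, Long] = rdd.countByValue()
    counts.foreach { case (value, count) => println(value + " -> " + count) } // ABC -> 2, Abc1 -> 1, Abc10 -> 1
    sc.stop()
  }
}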
Building the jar with Maven and submitting the job to the cluster
sudo ./bin/spark-submit --class day02.ActionRDD --executor-memory 20M --executor-cores 1 /home/hadoop/spark-1.4.0-bin-hadoop2.3/lib/sfd-1.0-SNAPSHOT.jar
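For reference, the jar above is typically produced beforehand with a standard Maven build (a sketch, assuming the default project layout; the artifact name sfd-1.0-SNAPSHOT.jar comes from the project's pom): run mvn clean package, which writes target/sfd-1.0-SNAPSHOT.jar, then copy it to the path used in the spark-submit command above.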

 

Reposted from: https://www.cnblogs.com/shi-qi/articles/10533966.html
