Common Spark Operators

Spark has a set of commonly used operators that are worth knowing well; being fluent with them makes day-to-day work much smoother. Let's walk through them together.

package com.wy

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object OperatorDemo {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("OperatorDemo")
    val sc = new SparkContext(conf)

    val numRDD: RDD[Int] = sc.parallelize(List(1,2,4,5,6,3))
    val wordRdd: RDD[String] = sc.parallelize(List("a=a","b=b","c=c","e=e","d","f","e=e"))
    val tuple2RDD: RDD[(String, Int)] = sc.parallelize(List(("a",2),("a",3),("b",2),("a",5),("b",4)))
    val tuple2RDD2: RDD[(String, Int)] = sc.parallelize(List(("a",6),("a",7)))

    //map: transforms each element one-to-one
    val mapRDD: RDD[Array[String]] = wordRdd.map(one=>{one.split("=")})
    val mapRDD2: RDD[String] = wordRdd.map(one=>one)
    //"a=a" -> Array("a","a")
    //"b=b" -> Array("b","b")
	
    //flatMap: like map, but flattens the resulting collections
    val flatmapRDD: RDD[String] = wordRdd.flatMap(one=>{one.split("=")})
    //"a=a" -> Array("a","a")
    //"b=b" -> Array("b","b")
    //flattened into a single stream of elements: "a","a","b","b"

    mapRDD.foreach(arr => println(arr.mkString(","))) //plain println on an Array only shows a reference, so format it
    flatmapRDD.foreach(println(_))

    //distinct: removes duplicate elements
    val distinctRDD: RDD[String] = wordRdd.distinct()
    distinctRDD.foreach(println(_))

    //filter: keeps only the elements for which the predicate is true
    val filterRDD: RDD[String] = wordRdd.filter(one=>{one.contains("=")})
    println("==================="+filterRDD+"=======================") //this only prints the RDD object's toString, not its contents
    filterRDD.foreach(println(_))
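    //A minimal sketch of how to print an RDD's actual contents on the driver:
    //collect it first. collect() pulls the whole data set into driver memory,
    //so only do this for small RDDs like the demo data here.
    println("===================" + filterRDD.collect().mkString(",") + "=======================")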

    //groupByKey: groups values by key without aggregating
    val groupbykeyRDD: RDD[(String, Iterable[Int])] = tuple2RDD.groupByKey()
    //key "a" -> (2,3,5)
    //key "b" -> (2,4)
    //reduceByKey: aggregates the values within each key group
    val reducebykeyRDD: RDD[(String, Int)] = tuple2RDD.reduceByKey(_+_)
    //key "a" -> 10
    //key "b" -> 6

    groupbykeyRDD.foreach(println(_))
    reducebykeyRDD.foreach(println(_))
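    //Note: reduceByKey combines values on the map side before the shuffle, so it
    //moves much less data across the network than groupByKey followed by a manual
    //sum; prefer it when the end goal is an aggregate.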

    //sortByKey: sorts by key, so swap (word,count) to (count,word) to sort by count, then swap back
    val reducebykeyRDD2: RDD[(Int, String)] = reducebykeyRDD.map(tuple=>(tuple._2,tuple._1))
    val sortByKeyRDD: RDD[(Int, String)] = reducebykeyRDD2.sortByKey()
    val sortByKeyRDD2: RDD[(String, Int)] = sortByKeyRDD.map(tuple=>(tuple._2,tuple._1))
    sortByKeyRDD2.foreach(println(_))
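    //The swap-sort-swap above can also be written more directly with sortBy,
    //which sorts by an arbitrary key function (a sketch, equivalent here):
    val sortByRDD: RDD[(String, Int)] = reducebykeyRDD.sortBy(_._2)
    sortByRDD.foreach(println(_))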

    //join: inner join on key; key "b" has no match in tuple2RDD2, so it is dropped
    //Expected output (order not guaranteed):
    /*(a,(2,6))
    (a,(2,7))
    (a,(3,6))
    (a,(3,7))
    (a,(5,6))
    (a,(5,7))*/
    val joinRDD1: RDD[(String, (Int, Int))] = tuple2RDD.join(tuple2RDD2)
    joinRDD1.foreach(println(_))
    val leftJoinRDD: RDD[(String, (Int, Option[Int]))] = tuple2RDD.leftOuterJoin(tuple2RDD2) //keeps every left key; missing right values become None
    leftJoinRDD.foreach(println(_))
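    //For completeness, a sketch of the remaining outer joins: the possibly-missing
    //side is wrapped in Option, on both sides for fullOuterJoin.
    val fullJoinRDD: RDD[(String, (Option[Int], Option[Int]))] = tuple2RDD.fullOuterJoin(tuple2RDD2)
    fullJoinRDD.foreach(println(_))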

    //union: concatenates the two RDDs without removing duplicates
    val unionRDD: RDD[(String, Int)] = tuple2RDD.union(tuple2RDD2)
    unionRDD.foreach(println(_))

    //mapPartitions: runs the function once per partition, receiving the whole partition as an iterator
    val mapPartitionRDD: RDD[String] = wordRdd.mapPartitions(it => {
      var list = List[String]()
      while (it.hasNext) {
        val str: String = it.next()
        val strarray: Array[String] = str.split("=")
        list = list:::strarray.toList
      }
      list.iterator
    })

    mapPartitionRDD.foreach(println(_))
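    //The same result, written more idiomatically: the partition iterator itself
    //supports flatMap, so no intermediate List needs to be built up.
    val mapPartitionRDD2: RDD[String] = wordRdd.mapPartitions(it => it.flatMap(_.split("=")))
    mapPartitionRDD2.foreach(println(_))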
  }

}

package com.bawei.sparkcore

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object OperatorDemo2 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("OperatorDemo")
    val sc = new SparkContext(conf)

    val numRDD: RDD[Int] = sc.parallelize(List(1,2,4,5,6,3))
    val wordRdd: RDD[String] = sc.parallelize(List("a=a","b=b","c=c","e=e","d","f","e=e"))
    val tuple2RDD: RDD[(String, Int)] = sc.parallelize(List(("a",2),("a",3),("b",2),("a",5),("b",4)))
    val tuple2RDD2: RDD[(String, Int)] = sc.parallelize(List(("a",6),("a",7)))

    //Action: foreach
    //Skipped here; note that foreach also has a foreachPartition variant, which is
    //the action counterpart of mapPartitions.
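    //A minimal foreachPartition sketch (println stands in for a real per-partition
    //side effect, e.g. opening one database connection per partition rather than
    //one per element):
    wordRdd.foreachPartition(it => it.foreach(println))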
	
    //Action: countByKey (returns a local Map of key -> element count)
    val resultMap: collection.Map[String, Long] = tuple2RDD.countByKey()
    for((k,v) <- resultMap){
      println("key " + k + " appears " + v + " times")
    }
    //Action: count
    val numlength: Long = numRDD.count()
    println("numlength is: "+numlength)
    //Action: first
    val first: Int = numRDD.first()
    println("first is: "+first)
    //Action: take
    val take: Array[Int] = numRDD.take(3)
    for(num <- take){
      println("take returned: "+num)
    }
    //Action: reduce
    val result: Int = numRDD.reduce(_+_)
    println("reduce result is: "+result)
    //Action: collect (returns all elements of the data set as an array on the driver)
    val collectResult: Array[Int] = numRDD.collect()
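    //Print the collected array (mkString, since Arrays print as references):
    println("collectResult: " + collectResult.mkString(","))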
    //Action: saveAsTextFile (writes the RDD as text part-files under the "oper-result" directory; the job fails if that directory already exists)
    numRDD.saveAsTextFile("oper-result")


  }

}
