There are some commonly used Spark operators we should know; being comfortable with them makes day-to-day work much easier. Let's go through them together.
package com.wy
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object OperatorDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local").setAppName("OperatorDemo")
val sc = new SparkContext(conf)
val numRDD: RDD[Int] = sc.parallelize(List(1,2,4,5,6,3))
val wordRdd: RDD[String] = sc.parallelize(List("a=a","b=b","c=c","e=e","d","f","e=e"))
val tuple2RDD: RDD[(String, Int)] = sc.parallelize(List(("a",2),("a",3),("b",2),("a",5),("b",4)))
val tuple2RDD2: RDD[(String, Int)] = sc.parallelize(List(("a",6),("a",7)))
//map: applies the function to every element and returns exactly one output element per input
val mapRDD: RDD[Array[String]] = wordRdd.map(one=>{one.split("=")})
val mapRDD2: RDD[String] = wordRdd.map(one=>one) //identity map: each output equals its input
//"a=a" -> Array("a","a")
//"b=b" -> Array("b","b")
//flatMap: like map, but the per-element results are flattened
val flatmapRDD: RDD[String] = wordRdd.flatMap(one=>{one.split("=")})
//"a=a" -> Array("a","a")
//"b=b" -> Array("b","b")
//flattened into individual elements: "a","a","b","b",...
mapRDD.foreach(println(_))
flatmapRDD.foreach(println(_))
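//For a quick illustration of the difference: map keeps one output element per input,
//while flatMap flattens every Array into individual elements.
println("map count: " + mapRDD.count()) //7, one Array[String] per input string
println("flatMap count: " + flatmapRDD.count()) //12, every token produced by split("=")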
//distinct: removes duplicate elements
val distinctRDD: RDD[String] = wordRdd.distinct()
distinctRDD.foreach(println(_))
//filter: keeps only the elements for which the predicate returns true
val filterRDD: RDD[String] = wordRdd.filter(one=>{one.contains("=")})
println("==================="+filterRDD+"=======================") //不能这样打印
filterRDD.foreach(println(_))
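//To actually inspect a small RDD's contents on the driver, collect (or take) it first and then print the result:
filterRDD.collect().foreach(one => println("filtered element: " + one))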
//groupByKey: only groups the values by key, no aggregation
val groupbykeyRDD: RDD[(String, Iterable[Int])] = tuple2RDD.groupByKey()
//key:"a" (2,3,5)
//key:"b" (2,4)
//reduceByKey: groups by key and then aggregates the values
val reducebykeyRDD: RDD[(String, Int)] = tuple2RDD.reduceByKey(_+_)
//key:"a" 10
//key:"b" 6
groupbykeyRDD.foreach(println(_))
reducebykeyRDD.foreach(println(_))
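//For comparison only: the same per-key sums can be derived from groupByKey plus mapValues,
//but reduceByKey is preferred because it pre-aggregates on the map side before the shuffle.
val sumFromGroup: RDD[(String, Int)] = groupbykeyRDD.mapValues(values => values.sum) //sumFromGroup is just an illustrative name
sumFromGroup.foreach(println(_))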
//sortByKey: sorts by key; to sort by value, swap (word, count) into (count, word), sort, then swap back
val reducebykeyRDD2: RDD[(Int, String)] = reducebykeyRDD.map(tuple=>(tuple._2,tuple._1))
val sortByKeyRDD: RDD[(Int, String)] = reducebykeyRDD2.sortByKey()
val sortByKeyRDD2: RDD[(String, Int)] = sortByKeyRDD.map(tuple=>(tuple._2,tuple._1))
sortByKeyRDD2.foreach(println(_))
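//A simpler alternative sketch: sortBy can sort by the value directly, with no swapping needed.
val sortByValueRDD: RDD[(String, Int)] = reducebykeyRDD.sortBy(tuple => tuple._2) //illustrative variable name
sortByValueRDD.foreach(println(_))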
//join: inner join by key; only "a" exists in both RDDs, and every pair of its left and right values is produced
/*(a,(2,6))
(a,(2,7))
(a,(3,6))
(a,(3,7))
(a,(5,6))
(a,(5,7))*/
val joinRDD1: RDD[(String, (Int, Int))] = tuple2RDD.join(tuple2RDD2)
joinRDD1.foreach(println(_))
//leftOuterJoin: keeps every key from the left RDD; values with no match on the right become None
val leftJoinRDD: RDD[(String, (Int, Option[Int]))] = tuple2RDD.leftOuterJoin(tuple2RDD2)
leftJoinRDD.foreach(println(_))
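//A small follow-up sketch: keys that only exist on the left side (here "b") carry None on the right,
//so getOrElse can supply a default value when the Option is unwrapped.
val leftJoinFilled: RDD[(String, (Int, Int))] = leftJoinRDD.mapValues(pair => (pair._1, pair._2.getOrElse(0)))
leftJoinFilled.foreach(println(_))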
//union: concatenates the two RDDs without removing duplicates
val unionRDD: RDD[(String, Int)] = tuple2RDD.union(tuple2RDD2)
unionRDD.foreach(println(_))
//mapPartitions: the function is invoked once per partition with an iterator over that partition's elements
val mapPartitionRDD: RDD[String] = wordRdd.mapPartitions(it => {
var list = List[String]()
while (it.hasNext) {
val str: String = it.next()
val strarray: Array[String] = str.split("=")
list = list:::strarray.toList //append the split tokens to the accumulated list
}
list.iterator //return an iterator over the new elements for this partition
})
mapPartitionRDD.foreach(println(_))
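//An equivalent, more concise mapPartitions sketch: transform the partition's iterator lazily with flatMap.
val mapPartitionRDD2: RDD[String] = wordRdd.mapPartitions(it => it.flatMap(str => str.split("=")))
mapPartitionRDD2.foreach(println(_))
sc.stop() //release the SparkContext once the job is finished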
}
}
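The first demo above mostly covered transformations. The second demo below focuses on action operators, which trigger the actual computation and either return results to the driver or write them out.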
package com.bawei.sparkcore
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object OperatorDemo2 {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local").setAppName("OperatorDemo")
val sc = new SparkContext(conf)
val numRDD: RDD[Int] = sc.parallelize(List(1,2,4,5,6,3))
val wordRdd: RDD[String] = sc.parallelize(List("a=a","b=b","c=c","e=e","d","f","e=e"))
val tuple2RDD: RDD[(String, Int)] = sc.parallelize(List(("a",2),("a",3),("b",2),("a",5),("b",4)))
val tuple2RDD2: RDD[(String, Int)] = sc.parallelize(List(("a",6),("a",7)))
//Action operator: foreach
//Not shown separately; foreach also has a foreachPartition variant, which relates to foreach the same way mapPartitions relates to map
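//A minimal foreachPartition sketch: the function receives one iterator per partition,
//which is handy when per-partition setup (such as opening a connection once) is needed.
tuple2RDD.foreachPartition(it => it.foreach(record => println(record)))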
//Action operator: countByKey, returns a Map from each key to the number of records with that key
val resultMap: collection.Map[String, Long] = tuple2RDD.countByKey()
for((k,v) <- resultMap){
println("key "+k+" occurs "+v+" times")
}
//Action operator: count, returns the number of elements in the RDD
val numlength: Long = numRDD.count()
println("numlength is: "+numlength)
//Action operator: first, returns the first element of the RDD
val first: Int = numRDD.first()
println("first is: "+first)
//Action operator: take, returns the first n elements as an Array
val take: Array[Int] = numRDD.take(3)
for(num <- take){
println("take result: "+num)
}
//Action operator: reduce, aggregates all elements with the given function (a sum here)
val result: Int = numRDD.reduce(_+_)
println("reduce result: "+result)
//Action operator: collect
val collectResult: Array[Int] = numRDD.collect() //returns all elements of the dataset to the driver program as an array
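//Print the collected array on the driver; only safe when the dataset is small.
println("collectResult: " + collectResult.mkString(","))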
//Action operator: saveAsTextFile
numRDD.saveAsTextFile("oper-result")
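//Note: saveAsTextFile creates a directory named "oper-result" containing part files, and it fails if that path already exists
sc.stop() //release the SparkContext once the job is finished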
}
}