参考
场景
transformation类算子:map、flatMap、reduceByKey、join与cogroup实战
实验
scala版
package main.scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.util.collection.CompactBuffer
/**
 * Hands-on demos of common RDD transformation operators:
 * map, filter, flatMap, groupByKey, reduceByKey, join, leftOuterJoin,
 * rightOuterJoin and cogroup.
 *
 * Every demo builds a small RDD, applies one transformation, collects the
 * result to the driver and prints it line by line.
 */
object Trasformation {

  /** Application name used when `sparkContext` is called without a name. */
  private val DefaultAppName = "Trasformation Ops"

  /**
   * Default input of the word-count demo. A `file` URI needs an absolute path
   * (`file:///...`); the scheme followed by a relative path is rejected when
   * the path is parsed.
   */
  private val DefaultWordCountInput = "file:///home/pengyucheng/java/rdd2dfram.txt"

  /** Runs every demo against one shared local SparkContext. */
  def main(args: Array[String]): Unit = {
    val sc = sparkContext("just for fun")
    // Stop the context even when one of the demos throws.
    try {
      mapTransformation(sc)
      filterTransformation(sc)
      flatMapTransformation(sc)
      groupByKeyTransformation(sc)
      reduceByKeyTransformation(sc)
      joinTransformation(sc)
      leftJoinTransformation(sc)
      rightJoinTransformation(sc)
      cogroupTransformation(sc)
    } finally {
      sc.stop()
    }
  }

  /**
   * Creates a SparkContext with master `local[*]`.
   *
   * @param name optional application name; the first value is used, and
   *             "Trasformation Ops" is the fallback when none is given
   * @return a new SparkContext
   */
  def sparkContext(name: String*): SparkContext = {
    val appName = name.headOption.getOrElse(DefaultAppName)
    val conf = new SparkConf().setMaster("local[*]").setAppName(appName)
    new SparkContext(conf)
  }

  /*
   * def map[U](f: T => U)(implicit evidence: ClassTag[U]): RDD[U]
   */
  /** Doubles each number from 1 to 10 and prints the results. */
  def mapTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 10)
    nums.map(_ * 2).collect().foreach(println)
  }

  /*
   * def filter(f: T => Boolean): RDD[T]
   */
  /** Keeps the even numbers from 1 to 10 and prints them. */
  def filterTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 10)
    nums.filter(_ % 2 == 0).collect().foreach(println)
  }

  /*
   * def flatMap[U](f: T => TraversableOnce[U])(implicit evidence: ClassTag[U]): RDD[U]
   */
  /** Splits each line on a single space and prints every resulting word. */
  def flatMapTransformation(sc: SparkContext): Unit = {
    val bigData = Array("Scala Spark", "Java Hadoop")
    val lines = sc.parallelize(bigData)
    lines.flatMap(_.split(" ")).collect().foreach(println)
  }

  /*
   * def groupByKey(): RDD[(K, Iterable[V])]
   */
  /** Groups (score, name) pairs by score and prints each group. */
  def groupByKeyTransformation(sc: SparkContext): Unit = {
    val scores = Array((100, "Spark"), (90, "flink"), (60, "hadoop"), (90, "docker"))
    sc.parallelize(scores).groupByKey().collect().foreach(println)
  }

  /*
   * def reduceByKey(func: (V, V) => V): RDD[(K, V)]
   */
  /**
   * Word count: splits every line of a text file on a single space and prints
   * each (word, count) pair.
   *
   * @param sc   the SparkContext to run on
   * @param path input file; defaults to the sample file used by this demo
   */
  def reduceByKeyTransformation(sc: SparkContext, path: String = DefaultWordCountInput): Unit = {
    sc.textFile(path)
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collect()
      .foreach(println)
  }

  /*
   * def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
   */
  /** Inner join of (id, name) with (id, score); prints (id, (name, score)). */
  def joinTransformation(sc: SparkContext): Unit = {
    val studentNames = Array((1, "Spark"), (2, "hadoop"), (3, "Tachyon"))
    val studentScores = Array((1, 100), (2, 60), (3, 90))
    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)
    names.join(scores).collect().foreach(println)
  }

  /*
   * def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
   */
  /**
   * Left outer join of (id, name) with (id, score). Every name is kept; the
   * score side is an Option and is None for ids without a score (id 4 here).
   */
  def leftJoinTransformation(sc: SparkContext): Unit = {
    val studentNames = Array((1, "Spark"), (2, "hadoop"), (3, "Tachyon"), (4, "Other"))
    val studentScores = Array((1, 100), (2, 60), (3, 90))
    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)
    names.leftOuterJoin(scores).collect().foreach(println)
  }

  /*
   * def rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], W))]
   */
  /**
   * Right outer join of (id, name) with (id, score). Every score is kept; the
   * name side is an Option. Id 4 has no score, so it does not appear.
   */
  def rightJoinTransformation(sc: SparkContext): Unit = {
    val studentNames = Array((1, "Spark"), (2, "hadoop"), (3, "Tachyon"), (4, "Other"))
    val studentScores = Array((1, 100), (2, 60), (3, 90))
    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)
    names.rightOuterJoin(scores).collect().foreach(println)
  }

  /*
   * def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
   */
  /**
   * Cogroups (id, name) with (id, score): for each id, prints the collection
   * of its names together with the collection of its scores.
   */
  def cogroupTransformation(sc: SparkContext): Unit = {
    val studentNames = Array((1, "spark"), (2, "tackyon"), (3, "hadoop"))
    val studentScores = Array((1, 100), (2, 90), (3, 70), (1, 110), (2, 95), (2, 60))
    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)
    names.cogroup(scores).collect().foreach(println)
  }
}
执行结果
-flatMap
16/05/27 09:38:17 INFO DAGScheduler: ResultStage 2 (collect at Trasformation.scala:43) finished in 0.036 s
Scala
Spark
Java
Hadoop
16/05/27 09:38:17 INFO DAGScheduler: Job 2 finished: collect at Trasformation.scala:43, took 0.062018 s
-groupByKey
16/05/27 09:38:17 INFO DAGScheduler: Job 3 finished: collect at Trasformation.scala:48, took 0.226175 s
(100,CompactBuffer(Spark))
(90,CompactBuffer(flink, docker))
(60,CompactBuffer(hadoop))
16/05/27 09:38:17 INFO SparkUI: Stopped Spark web UI at http://192.168.0.5:4040
-join
16/05/27 10:01:57 INFO DAGScheduler: Job 5 finished: collect at Trasformation.scala:65, took 0.099787 s
(2,(hadoop,60))
(1,(Spark,100))
(3,(Tachyon,90))
16/05/27 10:01:57 INFO SparkUI: Stopped Spark web UI at http://192.168.0.5:4041
-leftjoin
16/05/27 10:12:55 INFO DAGScheduler: Job 0 finished: collect at Trasformation.scala:86, took 0.805630 s
(4,(Other,None))
(2,(hadoop,Some(60)))
(1,(Spark,Some(100)))
(3,(Tachyon,Some(90)))
16/05/27 10:12:55 INFO SparkContext: Starting job: collect at Trasformation.scala:99
-rightjoin
16/05/27 10:12:55 INFO DAGScheduler: Job 1 finished: collect at Trasformation.scala:99, took 0.139523 s
(2,(Some(hadoop),60))
(1,(Some(Spark),100))
(3,(Some(Tachyon),90))
16/05/27 10:12:55 INFO ContextCleaner: Cleaned accumulator 3
-cogroup
16/05/27 11:10:03 INFO DAGScheduler: Job 0 finished: collect at Trasformation.scala:110, took 0.866773 s
(2,(CompactBuffer(tackyon),CompactBuffer(90, 95, 60)))
(1,(CompactBuffer(spark),CompactBuffer(100, 110)))
(3,(CompactBuffer(hadoop),CompactBuffer(70)))
16/05/27 11:10:03 INFO SparkUI: Stopped Spark web UI at http://192.168.0.5:4040
java版(cogroup)
package cool.pengych.spark.core;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Java version of the cogroup operator demo.
 *
 * Builds an (id, name) pair RDD and an (id, score) pair RDD, cogroups them by
 * id, and prints, for every id, the names and the scores collected for it.
 */
public class CogroupTest {
    public static void main(String[] args) {
        // The app name previously read "RDD2DataFrameByReflection", copied from another example.
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CogroupTest");

        // try-with-resources closes the context even if the job fails.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            List<Tuple2<Integer, String>> nameList = Arrays.asList(
                new Tuple2<Integer, String>(1, "spark"),
                new Tuple2<Integer, String>(2, "tackyon"),
                new Tuple2<Integer, String>(3, "hadoop")
            );
            List<Tuple2<Integer, Integer>> scoreList = Arrays.asList(
                new Tuple2<Integer, Integer>(1, 100),
                new Tuple2<Integer, Integer>(2, 90),
                new Tuple2<Integer, Integer>(3, 70),
                new Tuple2<Integer, Integer>(1, 110),
                new Tuple2<Integer, Integer>(2, 95),
                new Tuple2<Integer, Integer>(2, 60)
            );

            JavaPairRDD<Integer, String> names = sc.parallelizePairs(nameList);
            JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(scoreList);

            /*
             * <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> JavaPairRDD.cogroup(JavaPairRDD<K, W> other)
             * Here K = Integer, V = String (names) and W = Integer (scores).
             */
            JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> cogroups = names.cogroup(scores);

            // foreach is given a callback invoked once per (id, (names, scores)) record.
            cogroups.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> t) throws Exception {
                    System.out.println("ID:" + t._1);
                    System.out.println("Name:" + t._2._1);
                    System.out.println("Score:" + t._2._2);
                }
            });
        }
    }
}
执行结果
16/05/27 10:57:56 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 3 ms
ID:2
Name:[tackyon]
Score:[90, 95, 60]
16/05/27 10:57:56 INFO Executor: Finished task 0.0 in stage 2.0 (TID 4). 1165 bytes result sent to driver
ID:1
Name:[spark]
Score:[100, 110]
ID:3
Name:[hadoop]
Score:[70]
16/05/27 10:57:56 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 4) in 141 ms on localhost (1/2)
总结
1、join、leftOuterJoin、rightOuterJoin相当于关系型数据库中的inner join、left outer join、right outer join。leftOuterJoin:以左边的RDD为基准进行join,右边的RDD中可能没有与之匹配的元素,所以右边的值用Option进行了封装(匹配到时为Some(值),匹配不到时为None);rightOuterJoin同理,只是被Option封装的是左边的值。
2、cogroup:先把两个RDD各自按key进行group,再按key把两边的分组结果合并在一起,得到 (K, (Iterable[V], Iterable[W]));某个key在其中一边不存在时,对应的Iterable为空。
3、CompactBuffer(Spark API文档中找不到这个类的说明,因为它是 private[spark] 的内部类;源码中有说明:org.apache.spark.util.collection.CompactBuffer) & ArrayBuffer:groupBy等操作之后,如果同一个K对应的V数量比较少,CompactBuffer比ArrayBuffer更节省内存、更高效。
“
* An append-only buffer similar to ArrayBuffer, but more memory-efficient for small buffers.
* ArrayBuffer always allocates an Object array to store the data, with 16 entries by default,
* so it has about 80-100 bytes of overhead. In contrast, CompactBuffer can keep up to two
* elements in fields of the main object, and only allocates an Array[AnyRef] if there are more
* entries than that. This makes it more efficient for operations like groupBy where we expect
* some keys to have very few elements.
”