Spark: common join vs. map join (broadcast join)
common join
demo
scala> val rdd1=sc.parallelize(List((1,"a"),(2,"b"),(1,"c"),(2,"d"),(3,"e")))
rdd1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> val data=sc.parallelize(List((1,"AA"),(2,"BB")))
data: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[1] at parallelize at <console>:24
scala> rdd1.join(data).foreach(println(_))
(1,(a,AA))
(1,(c,AA))
(2,(b,BB))
(2,(d,BB))
Spark UI: the common join stage shows a shuffle — 普通 join 会产生 shuffle。
map join
Spark 使用广播变量将小表进行广播（broadcast the small table to every executor as a broadcast variable）
scala> val rdd1=sc.parallelize(List((1,"a"),(2,"b"),(1,"c"),(2,"d"),(3,"e")))
rdd1: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> val data=sc.parallelize(List((1,"AA"),(2,"BB")))
data: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[1] at parallelize at <console>:24
scala> val broadcastdata=sc.broadcast(data.collectAsMap())
broadcastdata: org.apache.spark.broadcast.Broadcast[scala.collection.Map[Int,String]] = Broadcast(1)
scala> rdd1.mapPartitions(x=>{
| val mapdata=broadcastdata.value
| for((key,value) <- x if(mapdata.contains(key)))
| yield (key,mapdata.get(key),value)
| }).foreach(println(_))
(1,Some(AA),a)
(2,Some(BB),b)
(1,Some(AA),c)
(2,Some(BB),d)
Spark UI: map join 没有产生 shuffle — each partition joins locally against the broadcast copy of the small table, so no shuffle stage appears. Note that this implements inner-join semantics: keys absent from the small table (e.g. key 3 above) are dropped from the result.