总结:
- 当Task中使用到了本地的变量,而该变量不是经常变化且数据量不大
- 那么可以使用广播机制,将该变量发送到各个Executor内存中,这样各个Task就可以从Executor中直接获取并使用,避免了Driver将该变量单独发给各个Task
- 现象比较难理解,但是代码很简单
package cn.hanjiaxiaozhi.core
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Small Spark demo that resolves fruit ids to fruit names twice:
 * first by referencing a driver-side map directly inside the task closure,
 * then by reading the same map through a broadcast variable.
 *
 * Both passes print the same names; the difference is only in how the
 * lookup table reaches the tasks.
 */
object BroadcastTest {

  /**
   * Runs both lookups on a local Spark context and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("wc").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(conf)
    try {
      sc.setLogLevel("WARN")

      // Driver-side lookup table and the ids to resolve.
      val fruitMap: collection.Map[Int, String] =
        Map((1, "apple"), (2, "orange"), (3, "banana"), (4, "grape"))
      val fruitIds: RDD[Int] = sc.parallelize(List(2, 4, 1, 3))

      // Pass 1: the closure references fruitMap directly, so the map travels
      // with the closure that Spark serializes for the tasks.
      val fruitNames: RDD[String] = fruitIds.map(id => fruitMap(id))
      fruitNames.collect().foreach(println)

      println("============================")

      // Pass 2: wrap the map in a broadcast variable; the closure now only
      // captures the Broadcast handle and reads the map through `.value`.
      val mapBroadcast: Broadcast[collection.Map[Int, String]] = sc.broadcast(fruitMap)
      val fruitNames2: RDD[String] = fruitIds.map { id =>
        val map: collection.Map[Int, String] = mapBroadcast.value
        map(id)
      }
      fruitNames2.collect().foreach(println)
    } finally {
      // Always shut the context down, even if a job above fails, so the
      // local executors and Spark's background services are released.
      sc.stop()
    }
  }
}