广播变量(broadcast variable)为只读变量。使用广播变量的好处:每个节点的 executor 只保存一份副本,而不是每个 task 各保存一份,可以节省资源、提高性能,比如在机器学习中需要把较大的只读数据分发到各个节点的时候。
累加器(accumulator):各个 executor 上的 task 都可以向同一个累加器做累加(只能 add,不能读取),累加结果在 driver 端读取。累加器有以下几种创建方法:
/**
 * Demonstrates three ways of creating a Spark accumulator:
 *
 *  - a built-in long accumulator (`sc.longAccumulator`),
 *  - a built-in collection accumulator (`sc.collectionAccumulator`),
 *  - a user-defined accumulator registered through `sc.register`.
 *
 * Tasks only add to the accumulators; the accumulated values are read back
 * on the driver after the `foreach` action has finished.
 */
object AccumulatorTest {

  /**
   * Runs the demo job and prints the accumulated values on the driver.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    import scala.util.control.NonFatal

    val conf = new SparkConf().setAppName("test")
    val sc = new SparkContext(conf)

    // Always release the SparkContext, even if the job fails.
    try {
      val longAccumulator = sc.longAccumulator("count mapped data")
      val collectionAccumulator = sc.collectionAccumulator[String]("collect mapped data")
      // NOTE(review): CustomAccumulator is defined outside this snippet; presumably it
      // counts occurrences per distinct string — confirm against its definition.
      val mapAccumulator = new CustomAccumulator
      sc.register(mapAccumulator)

      val logData = sc.parallelize(Seq("plane", "fish", "duck", "dirty", "people", "plane"), 2)

      logData.foreach { str =>
        // Only occurrences of "plane" are counted.
        if (str == "plane") {
          longAccumulator.add(1L)
        }

        try {
          // some code
        } catch {
          // NonFatal lets fatal errors and interrupts propagate instead of being swallowed.
          case NonFatal(e) =>
            // getMessage may be null; fall back to the exception class name.
            collectionAccumulator.add(Option(e.getMessage).getOrElse(e.getClass.getName))
        }

        mapAccumulator.add(str)
      }

      // "plane" appears twice in the input, so the sum is 2.
      println(longAccumulator.sum)
      // Only exception messages are collected; the placeholder above never throws,
      // so this list is empty.
      println(collectionAccumulator.value)
      // Expected (if CustomAccumulator counts per word):
      // "plane -> 2", "fish -> 1", "duck -> 1", "dirty -> 1", "people -> 1"
      println(mapAccumulator.value)
    } finally {
      sc.stop()
    }
  }
}