累加器
- 一个分布式的只写变量,用来把 Executor 端的变量信息聚合到 Driver,在 Driver 中聚合数据;Executor 中的每个 Task 都会得到这个变量的一个新副本,每个 Task 更新自己的副本之后,再回传到 Driver 进行 merge(合并)
系统累加器
/**
 * Demonstrates Spark's built-in long accumulator: sums the elements of a small
 * RDD through the accumulator and prints the total on the driver.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val conf: SparkConf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("Spark_Acc")
  val sc = new SparkContext(conf)

  // Four integers spread over two partitions.
  val numbers: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4), 2)

  // Obtain a named system (built-in) accumulator from the context.
  val sumAcc = sc.longAccumulator("sumAcc")

  // Every element is added to the accumulator inside the foreach action.
  numbers.foreach(n => sumAcc.add(n))

  // Read the accumulated total back on the driver side.
  println(sumAcc.value)

  sc.stop()
}
自定义累加器
范例说明:自定义累加器实现 WordCount
/**
 * Word count implemented with a custom accumulator: each word is pushed into
 * the accumulator from a `foreach` action and the per-word totals are printed
 * on the driver.
 */
object WordCount_ByAcc {

  /**
   * Reads the text files under `/datas/*`, splits every line on single spaces
   * and counts the resulting words with [[MyAccumulator]].
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount_ByAcc")
    val sc = new SparkContext(sparkConf)

    // 1. Read the input files and split each line into words.
    val lines = sc.textFile("/datas/*", 3)
    val rdd: RDD[String] = lines.flatMap(_.split(" "))

    // 2. Create the custom accumulator and register it with the context
    //    before it is used inside an action.
    val accumulator = new MyAccumulator()
    sc.register(accumulator, "WordCountAcc")

    // 3. Feed every word into the accumulator.
    rdd.foreach(word => accumulator.add(word))

    // 4. Print the merged word counts.
    println(accumulator.value)
    sc.stop()
  }

  /**
   * Custom accumulator that counts occurrences of each word.
   *
   * Extends `AccumulatorV2[IN, OUT]` where
   *  - IN  (`String`) is the type of the values added to the accumulator, and
   *  - OUT (`mutable.Map[String, Long]`) is the type of the accumulated result.
   */
  class MyAccumulator extends AccumulatorV2[String, mutable.Map[String, Long]] {

    // word -> number of times it has been added so far
    private val counts = mutable.Map[String, Long]()

    /** The accumulator is in its zero state when no word has been counted. */
    override def isZero: Boolean = counts.isEmpty

    /**
     * Creates a new accumulator holding a copy of the current counts.
     *
     * The copy must carry the current state (not be empty): the
     * `AccumulatorV2` contract defines `copy()` as a duplicate of this
     * accumulator, and `reset()` is the operation that empties it.
     */
    override def copy(): AccumulatorV2[String, mutable.Map[String, Long]] = {
      val duplicate = new MyAccumulator()
      duplicate.counts ++= counts
      duplicate
    }

    /** Clears all counts, returning the accumulator to its zero state. */
    override def reset(): Unit = counts.clear()

    /**
     * Records one more occurrence of `v`.
     *
     * @param v the word to count
     */
    override def add(v: String): Unit = {
      counts.update(v, counts.getOrElse(v, 0L) + 1L)
    }

    /**
     * Merges the counts of `other` into this accumulator by summing the
     * counts of equal words.
     *
     * @param other accumulator whose counts are added to this one
     */
    override def merge(other: AccumulatorV2[String, mutable.Map[String, Long]]): Unit = {
      other.value.foreach { case (word, count) =>
        counts.update(word, counts.getOrElse(word, 0L) + count)
      }
    }

    /**
     * Returns the accumulated word counts. This is the accumulator's internal
     * map itself, not a defensive copy.
     */
    override def value: mutable.Map[String, Long] = counts
  }
}
广播变量
- 将闭包中的变量数据放到 Executor 内存中,以达到变量在多个 Task 中共享的目的
- spark 中的广播变量不能更改
/**
 * Demonstrates a broadcast variable: a small lookup map is broadcast and each
 * `(word, count)` pair of the RDD is paired with the value found for the same
 * word in that map (0 when the word is absent).
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount_ByBroadcast")
  val sc = new SparkContext(sparkConf)
  try {
    val rdd: RDD[(String, Int)] = sc.makeRDD(List(("a", 1), ("b", 2), ("c", 3)))

    // Broadcast values must not be modified after broadcasting, so an
    // immutable Map is used for the lookup table.
    val lookup: Map[String, Int] = Map("a" -> 4, "b" -> 5, "c" -> 6)
    val bc: Broadcast[Map[String, Int]] = sc.broadcast(lookup)

    rdd
      .map { case (w, c) =>
        // Look the word up in the broadcast map; fall back to 0 if missing.
        val i = bc.value.getOrElse(w, 0)
        (w, (c, i))
      }
      .collect()
      .foreach(println)
  } finally {
    // Always release the context, even if the job fails.
    sc.stop()
  }
}