import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

// Custom accumulator that counts word occurrences into a mutable HashMap.
class AccWordCount extends AccumulatorV2[String, mutable.HashMap[String, Int]] {
  private val hashAcc = new mutable.HashMap[String, Int]()

  override def isZero: Boolean = hashAcc.isEmpty

  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = {
    val newAcc = new AccWordCount
    // Multiple tasks may write at the same time, so synchronize to avoid thread-safety issues.
    hashAcc.synchronized {
      newAcc.hashAcc ++= this.hashAcc
    }
    newAcc
  }

  override def reset(): Unit = hashAcc.clear()

  // Local (per-task) accumulation.
  override def add(v: String): Unit = hashAcc.synchronized {
    hashAcc.get(v) match {
      case None    => hashAcc += ((v, 1))
      case Some(x) => hashAcc += ((v, x + 1))
    }
  }

  // Global accumulation: merge another task's partial counts into this map.
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit =
    for ((k, v) <- other.value) {
      hashAcc.get(k) match {
        case None    => hashAcc += ((k, v))
        case Some(x) => hashAcc += ((k, x + v)) // add the two counts; the original (k, v + 1) dropped counts
      }
    }

  override def value: mutable.HashMap[String, Int] = hashAcc
}
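Note the fix in merge: the original snippet wrote (k, v + 1) when a key already existed, which ignores the count accumulated so far; merging two partial maps must add the two counts together. Since merge is plain Scala, a quick local sanity check (no SparkContext needed; the instances and words below are just a hypothetical illustration) could look like this:

    val a = new AccWordCount
    val b = new AccWordCount
    Seq("spark", "spark", "hello").foreach(a.add) // a.value: Map(spark -> 2, hello -> 1)
    Seq("spark", "scala").foreach(b.add)          // b.value: Map(spark -> 1, scala -> 1)
    a.merge(b)
    println(a.value) // expected: Map(spark -> 3, hello -> 1, scala -> 1)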
Usage
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// Boilerplate setup
private val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
private val sc = new SparkContext(conf)
// Sample data
private val rdd: RDD[String] = sc.makeRDD(List("hello", "world", "python", "spark", "spark", "hello", "scala", "spark", "python"))
// Create the accumulator instance
private val acc = new AccWordCount
// Register it with the SparkContext before use
sc.register(acc, "acc")
// Distributed accumulation
rdd.foreach(x => acc.add(x))
println(acc.value)
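For the sample data above, acc.value should print a map equivalent to Map(hello -> 2, world -> 1, python -> 2, spark -> 3, scala -> 1). As a sanity check (this comparison is an addition, not part of the original snippet), the same counts can be obtained from Spark's built-in countByValue action on the same rdd:

    // Cross-check with the built-in action; it returns Map[String, Long].
    println(rdd.countByValue()) // Map(hello -> 2, world -> 1, python -> 2, spark -> 3, scala -> 1)

Also note that Spark only guarantees exactly-once application of accumulator updates performed inside actions such as foreach; updating an accumulator inside a transformation like map can double-count when tasks are retried.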