自定义单词统计累加器
应用场景:Driver端定义一个共享变量,将数据累加到该变量上,如果直接用foreach或map等迭代算子,是无法将累加的变量返回到driver端,因为累加的过程发生在Executor端。一般用于计数场景下,变量 往往声明在Driver端。
特性: 变量在Driver端,累加的过程是在Executor端,在累加的过程Executor端是无法读取其值的,如果想读取其值,
只能在Driver端才能读取。
使用:
1.创建一个Accumulator累加器的实例
2.通过sc.register()注册一个累加器
3.通过累加器实例名.add来添加数据
4.通过累加器实例名.value来获取累加器的值
package com.jxlg.accumlator
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object AccumulatorV2Demo_3 {

  /** Driver entry point: registers a custom word-count accumulator, feeds every
    * line of a two-partition RDD into it on the executors, then reads and prints
    * the merged counts back on the driver.
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName(this.getClass.getName)
      .setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    // Two partitions, so `add` runs per-task on executors and Spark merges the copies.
    val lines = sc.parallelize(List("uzi jk theshy", "jk theshy", "uzi ", "ksar theshy"), 2)

    val accumulator = new MyAccumulator_wc()
    sc.register(accumulator, "wc")

    // Side-effecting iteration: accumulation happens executor-side; the value
    // is only readable here on the driver, after the action completes.
    lines.foreach(accumulator.add)
    println(accumulator.value)

    sc.stop()
  }
}
/** Custom word-count accumulator: IN = a line of space-separated words (String),
  * OUT = a mutable map from word to occurrence count.
  *
  * Spark snapshots this accumulator per task via `copy()`, accumulates on the
  * executors via `add`, combines partial results via `merge`, and the final
  * counts are read with `value` — driver side only.
  */
class MyAccumulator_wc extends AccumulatorV2[String, mutable.HashMap[String, Int]] {

  // Per-instance word -> count state; mutable by design (AccumulatorV2 contract).
  private val accMap = new mutable.HashMap[String, Int]()

  /** True when nothing has been accumulated yet. */
  override def isZero: Boolean = accMap.isEmpty

  /** Snapshot this accumulator into a fresh instance. Synchronized so a
    * concurrent `add` cannot corrupt the copy while it is being taken. */
  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = {
    val acc = new MyAccumulator_wc
    accMap.synchronized {
      acc.accMap ++= accMap
    }
    acc
  }

  /** Drop all accumulated counts. */
  override def reset(): Unit = accMap.clear()

  /** Split `v` on single spaces and bump each word's count.
    * Fix: the original used `splited.map(...)` purely for its side effects,
    * building and discarding a result array — `foreach` states the intent
    * and avoids the allocation. `getOrElse` replaces the Option match. */
  override def add(v: String): Unit =
    v.split(" ").foreach { word =>
      accMap += word -> (accMap.getOrElse(word, 0) + 1)
    }

  /** Fold another accumulator's counts into this one.
    * Fix: the original pattern-matched on
    * `AccumulatorV2[String, mutable.HashMap[String, Int]]`, which is erased at
    * runtime (unchecked) and always succeeded with no fallback case — reading
    * `other.value` directly is equivalent and warning-free. */
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit =
    for ((word, count) <- other.value)
      accMap += word -> (accMap.getOrElse(word, 0) + count)

  /** Current word counts (read on the driver after the action completes). */
  override def value: mutable.HashMap[String, Int] = accMap
}