Spark自定义AccumulatorV2
1. 概述
- AccumulatorV2的简单使用与注意事项见上一篇博客:https://blog.csdn.net/wtzhm/article/details/86481846
- 在实际开发中很多时候需要用到自定义Accumulator。如果为每个统计指标都单独定义一个Accumulator,Accumulator就会太多,不便于维护:例如项目出现一些逻辑上的变更(比如session数量的计算逻辑要改变),就得更改所有Accumulator对应的代码;又或者要增加几个统计范围,那么又要增加多个Accumulator,并且修改对应的累加代码,维护成本相当高。
2. 实例
-
定义一个类继承AccumulatorV2抽象类,实现6个抽象方法
package com.spark.zhmcode.session

import com.spark.zhmcode.utils.MyStringUtils
import org.apache.spark.util.AccumulatorV2

/**
 * Accumulator that counts how often each of a fixed set of session keys is added.
 *
 * The state is one concatenated string of `key=count` entries separated by `|`,
 * e.g. `session1=0|session2=0|session3=0|session4=0`. The first type parameter of
 * `AccumulatorV2` is the input type (the key to increment) and the second is the
 * output type (the concatenated counter string).
 */
class MyCustomerAcculatorv2 extends AccumulatorV2[String, String] {

  import MyCustomerAcculatorv2.{Delimiter, InitialValue}

  /** Current counters in `key=count|key=count|...` form. */
  var result: String = InitialValue

  /**
   * @return true when the counters still equal the initial all-zero state
   */
  override def isZero: Boolean = result == InitialValue

  /**
   * Creates a new accumulator carrying the same counter string as this one.
   */
  override def copy(): AccumulatorV2[String, String] = {
    val accumulator = new MyCustomerAcculatorv2()
    accumulator.result = this.result
    accumulator
  }

  /**
   * Puts the counters back to the initial all-zero state.
   */
  override def reset(): Unit = {
    result = InitialValue
  }

  /**
   * Increments the counter stored under key `v` by one.
   *
   * Nothing happens when `v` or the current state is null/empty. A key that is
   * not part of the counter string leaves the state unchanged, because
   * `setFieldFromConcatString` returns its input when the key is absent.
   *
   * @param v the key whose counter should be incremented
   */
  override def add(v: String): Unit = {
    if (MyStringUtils.isNotEmpty(result) && MyStringUtils.isNotEmpty(v)) {
      val oldValue = MyStringUtils.getFieldFromConcatString(result, Delimiter, v)
      // Only touch the state when a count was found; previously a null lookup
      // would have replaced the whole counter string with "".
      if (oldValue != null) {
        val newValue = oldValue.toInt + 1
        result = MyStringUtils.setFieldFromConcatString(result, Delimiter, v, newValue)
      }
    }
  }

  /**
   * Adds the counters of `other` to the counters of this accumulator, key by key.
   *
   * The previous implementation assigned `other.value` to `result`, which threw
   * away everything this accumulator had counted so far.
   *
   * @param other accumulator to fold into this one; must be a `MyCustomerAcculatorv2`
   * @throws UnsupportedOperationException if `other` is of a different accumulator type
   */
  override def merge(other: AccumulatorV2[String, String]): Unit = other match {
    case that: MyCustomerAcculatorv2 =>
      result = result
        .split(Delimiter)
        .map { entry =>
          entry.split("=") match {
            case Array(key, count) =>
              // The lookup yields "0" for keys the other side does not have.
              val otherCount =
                MyStringUtils.getFieldFromConcatString(that.result, Delimiter, key)
              s"$key=${count.toInt + otherCount.toInt}"
            case _ =>
              // Keep anything that is not a well-formed key=count pair untouched.
              entry
          }
        }
        .mkString("|")
    case _ =>
      throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  /**
   * @return the current counter string
   */
  override def value: String = result
}

object MyCustomerAcculatorv2 {

  /** Counter string every accumulator starts from and is reset to. */
  private val InitialValue = "session1=0|session2=0|session3=0|session4=0"

  /** Regex used to split the counter string into `key=count` entries. */
  private val Delimiter = "\\|"
}
-
MyStringUtils工具类
package com.spark.zhmcode.utils

/**
 * Helpers for strings made of `key=value` entries joined by a delimiter,
 * e.g. `session1=0|session2=0|session3=0|session4=0`.
 */
object MyStringUtils {

  /**
   * Looks up the value stored under `field` in a concatenated string.
   *
   * Entries that do not split into exactly one key and one value on `=` are
   * ignored. When the key occurs more than once, the last occurrence wins.
   *
   * @param str       concatenated string, e.g. "session1=0|session2=0|session3=0|session4=0"
   * @param delimiter regular expression the entries are split on
   * @param field     key to look up
   * @return the value found for `field`, or "0" when there is no such key
   */
  def getFieldFromConcatString(str: String, delimiter: String, field: String): String =
    str
      .split(delimiter)
      .map(_.split("="))
      .collect { case Array(key, value) if field.equals(key) => value }
      .lastOption
      .getOrElse("0")

  /**
   * Replaces the value stored under `field` in a concatenated string.
   *
   * When the key is present the string is rebuilt from its well-formed
   * `key=value` entries only, joined with a literal `|`; every entry whose key
   * equals `field` gets `newValue`. When the key is absent the input string is
   * returned as it was passed in.
   *
   * @param str       concatenated string, e.g. "session1=0|session2=0|session3=0|session4=0"
   * @param delimiter regular expression the entries are split on
   * @param field     key whose value should be replaced
   * @param newValue  value to store under `field`
   * @return the rebuilt string, or `str` itself when `field` is not present
   */
  def setFieldFromConcatString(str: String, delimiter: String, field: String, newValue: Integer): String = {
    val pairs = str
      .split(delimiter)
      .map(_.split("="))
      .collect { case Array(key, value) => (key, value) }

    val fieldPresent = pairs.exists { case (key, _) => field.equals(key) }

    if (fieldPresent) {
      pairs
        .map {
          case (key, _) if field.equals(key) => s"$key=$newValue"
          case (key, value)                  => s"$key=$value"
        }
        .mkString("|")
    } else {
      str
    }
  }

  /**
   * Tells whether a string is neither null nor empty.
   *
   * @param str string to check
   * @return true when `str` is non-null and has at least one character
   */
  def isNotEmpty(str: String): Boolean = str != null && str.nonEmpty
}
-
创建自定义Accumulator,记住一定要注册,不然会抛出异常
package com.spark.zhmcode.session

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demo driver: registers a `MyCustomerAcculatorv2`, feeds it a list of keys from
 * an RDD and prints the resulting counter string.
 */
object MyAccumulator {

  /**
   * Runs the demo on a local two-thread master.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("MyAccumulator")
    val sc = new SparkContext(conf)
    // Stop the context even when the job fails, so it is not left running.
    try {
      val data = List(
        "session1", "session2", "session3", "session4",
        "session1", "session3", "session3", "session3",
        "a", "b", "c", "d")
      val rdd1 = sc.parallelize(data, 1)

      val accumulator = new MyCustomerAcculatorv2()
      // The accumulator is registered with the context before the job uses it.
      sc.register(accumulator, "countSession")

      // foreach is an action that returns Unit, so there is no result to bind.
      rdd1.foreach(x => accumulator.add(x))

      println("自定义accumulator统计结果:" + accumulator.value)
    } finally {
      sc.stop()
    }
  }
}