// Ask `sc` for a long accumulator named "sum1" and keep the returned handle in `sum`.
val sum = sc.longAccumulator("sum1")
/**
 * Builds a fresh [[LongAccumulator]], registers it under `name`, and hands it back.
 *
 * @param name the name passed to `register` together with the new accumulator
 * @return the newly created accumulator, after `register` has been called on it
 */
def longAccumulator(name: String): LongAccumulator = {
  val created = new LongAccumulator
  register(created, name)
  created
}
/**
* An [[AccumulatorV2 accumulator]] for computing sum, count, and average of 64-bit integers.
*
* @since 2.0.0
*/
// NOTE(review): this is an excerpt — the class body is not closed in this snippet.
class LongAccumulator extends AccumulatorV2[jl.Long, jl.Long] { // AccumulatorV2 takes two type parameters; what they should be depends on the use case — open the AccumulatorV2 definition to see how they are used.
private var _sum = 0L
private var _count = 0L
/**
* Creates a new copy of this accumulator, which is zero value. i.e. call `isZero` on the copy
* must return true.
*
* Implemented as `copy()` followed by `reset()` on the copy; the copy is what gets returned.
*/
def copyAndReset(): AccumulatorV2[IN, OUT] = { // presumably IN is the type of values added and OUT the type of the result — confirm against AccumulatorV2
val copyAcc = copy()
copyAcc.reset()
copyAcc
}
// Study how an existing accumulator is written first, then write our own.
/**
 * Accumulator over 64-bit integers that tracks a running sum together with the number of
 * values added, from which an average can be derived.
 */
class LongAccumulator extends AccumulatorV2[jl.Long, jl.Long] {
  // Running total of all values added (or merged in) so far.
  private var _sum = 0L
  // How many values have been added (or merged in) so far.
  private var _count = 0L

  /**
   * Returns true only while both the sum and the count are still zero.
   *
   * @since 2.0.0
   */
  override def isZero: Boolean = _count == 0L && _sum == 0L

  /**
   * Produces a new, independent accumulator carrying the same sum and count as this one.
   */
  override def copy(): LongAccumulator = {
    val duplicate = new LongAccumulator
    duplicate._sum = _sum
    duplicate._count = _count
    duplicate
  }

  /** Puts the sum and the count back to zero. */
  override def reset(): Unit = {
    _count = 0L
    _sum = 0L
  }

  /**
   * Adds a boxed value: the sum grows by `v` and the count by one.
   *
   * @since 2.0.0
   */
  override def add(v: jl.Long): Unit = {
    // The sum is updated first, so a null `v` fails before the count is touched.
    _sum = _sum + v
    _count = _count + 1
  }

  /**
   * Adds a primitive value: the sum grows by `v` and the count by one.
   *
   * @since 2.0.0
   */
  def add(v: Long): Unit = {
    _sum = _sum + v
    _count = _count + 1
  }

  /**
   * Number of values added to this accumulator.
   *
   * @since 2.0.0
   */
  def count: Long = _count

  /**
   * Sum of the values added to this accumulator.
   *
   * @since 2.0.0
   */
  def sum: Long = _sum

  /**
   * Sum divided by count, as a floating-point division; the result is not a finite number
   * while the count is zero.
   *
   * @since 2.0.0
   */
  def avg: Double = _sum.toDouble / _count.toDouble

  /**
   * Folds another accumulator's sum and count into this one.
   *
   * @throws UnsupportedOperationException if `other` is not a [[LongAccumulator]]
   */
  override def merge(other: AccumulatorV2[jl.Long, jl.Long]): Unit = other match {
    case that: LongAccumulator =>
      _sum = _sum + that.sum
      _count = _count + that.count
    case _ =>
      val message = s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}"
      throw new UnsupportedOperationException(message)
  }

  /** Overwrites the sum with `newValue`; the count is left as it is. */
  private[spark] def setValue(newValue: Long): Unit = {
    _sum = newValue
  }

  /** The accumulated result, i.e. the current sum. */
  override def value: jl.Long = _sum
}
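// A custom accumulator needs a collection to hold its state. Because accumulators avoid the shuffle step, they are somewhat more efficient.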
/**
 * Demo of a user-defined Spark accumulator: counts the words of an RDD that start with "H"
 * by accumulating them into a map, without going through a shuffle.
 */
object accumulator03_define {

  /**
   * Builds a local SparkContext, runs the word count through [[MyAccumulator]] and prints
   * the accumulated map.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf and set the application name.
    val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
    // 2. Create the SparkContext, the entry point for submitting a Spark application.
    val sc: SparkContext = new SparkContext(conf)

    try {
      // 3. Create the RDD (two partitions).
      val rdd = sc.makeRDD(List("Hello", "Hello", "Hello", "Hello", "Spark", "Spark"), 2)

      // 3.1 Create the accumulator object (modelled on how Spark's own accumulators are used).
      val acc = new MyAccumulator()
      // 3.2 Register the accumulator with the SparkContext.
      sc.register(acc, "WordCount")
      // 3.3 Use the accumulator: every element of the RDD is fed to `add`, which holds the
      //     counting logic.
      rdd.foreach(word => acc.add(word))
      // 3.4 Print the map maintained by the accumulator, e.g. Map(Hello -> 4).
      println(acc.value)
    } finally {
      // Close the connection even if the job above fails.
      sc.stop()
    }
  }

  /**
   * Accumulator that counts occurrences of words starting with "H".
   *
   * Declaring a custom accumulator takes two steps:
   *  1. extend AccumulatorV2 and fix the input type (String) and the output type
   *     (a mutable map from word to count);
   *  2. override its abstract methods.
   */
  class MyAccumulator extends AccumulatorV2[String, mutable.Map[String, Long]] {
    // Word -> number of occurrences; this collection is the accumulator's state.
    var map: mutable.Map[String, Long] = mutable.Map[String, Long]()

    /** The accumulator is in its initial state exactly when the map is empty. */
    override def isZero: Boolean = map.isEmpty

    /**
     * Returns a new accumulator holding an independent copy of the current counts.
     *
     * The map is cloned rather than shared: with a shared map, calling `reset()` on the
     * copy would also wipe the counts of this accumulator.
     */
    override def copy(): AccumulatorV2[String, mutable.Map[String, Long]] = {
      val newAcc = new MyAccumulator
      newAcc.map = this.map.clone()
      newAcc
    }

    /** Resets the accumulator by removing every entry from the map. */
    override def reset(): Unit = map.clear()

    /**
     * Adds one element to the accumulator.
     *
     * @param v the current element; only words starting with "H" are counted
     */
    override def add(v: String): Unit = {
      if (v.startsWith("H")) {
        // Inserts the word with count 1, or increments the count if it is already present.
        map(v) = map.getOrElse(v, 0L) + 1L
      }
    }

    /**
     * Merges the counts of another accumulator into this one, word by word.
     *
     * @param other the accumulator whose `value` map is folded into this map
     */
    override def merge(other: AccumulatorV2[String, mutable.Map[String, Long]]): Unit = {
      other.value.foreach { case (word, count) =>
        map(word) = map.getOrElse(word, 0L) + count
      }
    }

    /** The accumulated result: the map of word counts itself. */
    override def value: mutable.Map[String, Long] = map
  }
}
// How are the accumulator methods used? Learn from the existing implementation below.
// Ask `sc` for a long accumulator named "sum1" and keep the returned handle in `sum`.
val sum = sc.longAccumulator("sum1")
/**
 * Instantiates a [[LongAccumulator]], passes it to `register` with the given name, and
 * returns it.
 *
 * @param name the name to register the accumulator under
 * @return the accumulator that was just created and registered
 */
def longAccumulator(name: String): LongAccumulator = {
  val accumulator = new LongAccumulator
  // Per the original note, this method sits in SparkContext.scala, so the unqualified call
  // is effectively this.register(...) with `this` being the SparkContext — confirm.
  register(accumulator, name)
  accumulator
}
/**
 * Registers `acc` against this object under the given name by delegating to the
 * accumulator's own `register` method.
 *
 * @param acc  the accumulator to register
 * @param name the accumulator's name; wrapped with `Option(...)`, so a null name becomes `None`
 */
def register(acc: AccumulatorV2[_, _], name: String): Unit = {
  val wrappedName = Option(name)
  acc.register(this, name = wrappedName)
}