Spark accumulators explained, with a small example

Start with the built-in long accumulator: create one through the SparkContext and step into its source.

val sum = sc.longAccumulator("sum1")
/**
   * Create and register a long accumulator, which starts with 0 and accumulates inputs by `add`.
   */
  def longAccumulator(name: String): LongAccumulator = {
    val acc = new LongAccumulator
    register(acc, name)
    acc
  }
/**
 * An [[AccumulatorV2 accumulator]] for computing sum, count, and average of 64-bit integers.
 *
 * @since 2.0.0
 */
class LongAccumulator extends AccumulatorV2[jl.Long, jl.Long] { // AccumulatorV2 takes two type parameters. What are they for? They depend on your use case: the input type and the output type. Step inside to see.
  private var _sum = 0L
  private var _count = 0L
/**
   * Creates a new copy of this accumulator, which is zero value. i.e. call `isZero` on the copy
   * must return true.
   */
  def copyAndReset(): AccumulatorV2[IN, OUT] = { // IN is what goes in, OUT is what comes out
    val copyAcc = copy()
    copyAcc.reset()
    copyAcc
  }
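Before writing our own, here is a minimal, self-contained sketch of how the built-in long accumulator is used end to end (the app name, object name, and values are illustrative, not from the original post): add() runs on the executors, while value/sum/count/avg are read back on the driver.

import org.apache.spark.{SparkConf, SparkContext}

object BuiltinAccumulatorDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("accDemo").setMaster("local[*]"))
    val sum = sc.longAccumulator("sum1")
    sc.makeRDD(List(1, 2, 3, 4), 2).foreach(n => sum.add(n)) // runs on the executors
    println(sum.value) // 10, read on the driver after the action completes
    println(sum.count) // 4 elements were added
    println(sum.avg)   // 2.5
    sc.stop()
  }
}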

Study how the built-in accumulator is written, then write your own.

class LongAccumulator extends AccumulatorV2[jl.Long, jl.Long] {
  private var _sum = 0L
  private var _count = 0L

  /**
   * Returns false if this accumulator has had any values added to it or the sum is non-zero.
   *
   * @since 2.0.0
   */
  override def isZero: Boolean = _sum == 0L && _count == 0

  override def copy(): LongAccumulator = {
    val newAcc = new LongAccumulator // create a fresh accumulator
    newAcc._count = this._count // copy the current _count into the new accumulator
    newAcc._sum = this._sum
    newAcc // return the new accumulator; this method creates a new accumulator object. Our custom accumulator below will not have these two fields.
  }

  override def reset(): Unit = {
    _sum = 0L
    _count = 0L
  }

  /**
   * Adds v to the accumulator, i.e. increment sum by v and count by 1.
   * @since 2.0.0
   */
  override def add(v: jl.Long): Unit = {
    _sum += v
    _count += 1
  }

  /**
   * Adds v to the accumulator, i.e. increment sum by v and count by 1.
   * @since 2.0.0
   */
  def add(v: Long): Unit = {
    _sum += v
    _count += 1
  }

  /**
   * Returns the number of elements added to the accumulator.
   * @since 2.0.0
   */
  def count: Long = _count

  /**
   * Returns the sum of elements added to the accumulator.
   * @since 2.0.0
   */
  def sum: Long = _sum

  /**
   * Returns the average of elements added to the accumulator.
   * @since 2.0.0
   */
  def avg: Double = _sum.toDouble / _count

  override def merge(other: AccumulatorV2[jl.Long, jl.Long]): Unit = other match {
    case o: LongAccumulator =>
      _sum += o.sum
      _count += o.count
    case _ =>
      throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  private[spark] def setValue(newValue: Long): Unit = _sum = newValue

  override def value: jl.Long = _sum
}

An accumulator needs a collection to accumulate into. Because partial results are merged on the driver, accumulators skip the shuffle stage entirely and can be a bit more efficient than a shuffle-based aggregation.
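For contrast, a rough sketch of the shuffle-based way to count the same words with reduceByKey, assuming a SparkContext sc like the one created in the example below; the accumulator version avoids this repartitioning step.

// Shuffle-based word count for comparison (sketch): reduceByKey repartitions
// the data by key across executors before summing.
val counts = sc.makeRDD(List("Hello", "Hello", "Spark"), 2)
  .map(word => (word, 1L))
  .reduceByKey(_ + _) // the shuffle happens here
  .collect()          // e.g. Array((Hello,2), (Spark,1))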

object accumulator03_define {
  def main(args: Array[String]): Unit = {
    //1 Create SparkConf and set the app name
    val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")

    //2 Create SparkContext, the entry point for submitting a Spark app
    val sc: SparkContext = new SparkContext(conf)

    //3 Create the RDD
    val rdd = sc.makeRDD(List("Hello", "Hello", "Hello", "Hello", "Spark", "Spark"), 2)
    // How do we use the accumulator? Follow the same pattern as the built-in one.

    //3.1 Create the accumulator instance
    val acc = new MyAccumulator()
    //3.2 Register the accumulator
    sc.register(acc, "WordCount")
    //3.3 Use the accumulator: iterate over the RDD's elements.
    // The elements are split across partitions and processed by different executors.
    rdd.foreach(word => {
      acc.add(word) // the accumulation logic is defined in the accumulator class below
    })

    //3.4 Print the accumulator's result
    println(acc.value) // Map(Hello -> 4): the map maintained inside the accumulator

    // Close the connection
    sc.stop()

  }

  // Declaring the accumulator:
  // 1. Extend AccumulatorV2 and set the input/output type parameters
  // 2. Override its methods
  class MyAccumulator extends AccumulatorV2[String, mutable.Map[String, Long]] {

    // The collection the accumulator maintains: each word and its occurrence count.
    var map = mutable.Map[String, Long]() // the trailing () calls the companion object's apply method; without it this would only name a type, not create an instance

    // Initial-state check: the accumulator is zero when the collection is empty
    override def isZero: Boolean = map.isEmpty

    // Copy the accumulator, modeled on LongAccumulator.copy()
    override def copy(): AccumulatorV2[String, mutable.Map[String, Long]] = {
      val newAcc = new MyAccumulator
      newAcc.map = this.map.clone() // clone rather than share the reference, so resetting the copy cannot clear this accumulator's map
      newAcc
    }

    // Reset the accumulator: empty the collection
    override def reset(): Unit = map.clear() // the mutable collection provides clear() itself

    // Add an element to the accumulator
    override def add(v: String): Unit = { // v is the current RDD element
      // Business logic: only count words starting with "H"
      if (v.startsWith("H")) {
        // getOrElse reads the current count (0L if the key is absent);
        // the assignment both inserts new keys and updates existing ones
        map(v) = map.getOrElse(v, 0L) + 1L
      }
    }

    // Merge accumulators: each executor's accumulator (its map) is merged on the driver
    override def merge(other: AccumulatorV2[String, mutable.Map[String, Long]]): Unit = {
      other.value.foreach {
        case (word, count) =>
          map(word) = map.getOrElse(word, 0L) + count // map is the collection this accumulator maintains
      }
    }

    // Return the accumulator's value: just hand back the map
    override def value: mutable.Map[String, Long] = map
  }

}
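One detail worth calling out: copy() above clones the map instead of sharing the reference. The inherited copyAndReset() (shown earlier) calls copy() and then reset() on the copy, so a shared reference would let that reset wipe the original's data as well. A small driver-side sketch of the distinction, using the MyAccumulator defined above:

val acc = new accumulator03_define.MyAccumulator
acc.add("Hello")
val fresh = acc.copyAndReset() // default implementation: copy(), then reset() the copy
println(fresh.isZero)          // true, as the contract requires
println(acc.value)             // Map(Hello -> 1), because copy() cloned the map;
                               // with a shared reference this would now be empty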

// Recap: how does sc.longAccumulator register the accumulator under the hood?
val sum = sc.longAccumulator("sum1")
def longAccumulator(name: String): LongAccumulator = {
    val acc = new LongAccumulator
    register(acc, name) // defined in SparkContext.scala, so this is really this.register(acc, name), where this is the SparkContext
    acc
  }
def register(acc: AccumulatorV2[_, _], name: String): Unit = {
    acc.register(this, name = Option(name))
  }
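In other words, sc.longAccumulator(name) is just convenience sugar. Constructing the accumulator yourself and registering it by hand is equivalent; a sketch, assuming a SparkContext sc as in the example above:

import org.apache.spark.util.LongAccumulator

val sum2 = new LongAccumulator // same object that longAccumulator("sum1") builds internally
sc.register(sum2, "sum1")      // registration is what ties it to this SparkContext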