关于SparkSQL中UDAF的一次设计失误

首先我们先贴一下同事的UDAF函数

package com.wby.fans.common
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{StringType, _}
object OrderUDAF {
  //  // accepted orders, rejected orders, quality-check-passed orders, total orders
  //  //order_acceptance_num   order_reject_num order_pass_num order_num
  // Aggregate function that counts orders by status and returns the counters as one
  // underscore-separated string:
  //   orderNum_orderAcceptNum_orderRejectNum_orderPassNum_orderPassRate
  class DealOrderNum() extends UserDefinedAggregateFunction {

    // Two integer input columns; update() only reads column 0 (status), order_id is never read.
    override def inputSchema: StructType = new StructType().add("status", IntegerType).add("order_id",IntegerType)

    // Five integer slots, addressed by position (0 to 4) everywhere below.
    override def bufferSchema: StructType = new StructType()
      .add("orderNum", IntegerType) // slot 0: total number of orders
      .add("orderAcceptNum", IntegerType) // slot 1: number of accepted orders
      .add("orderRejectNum", IntegerType) // slot 2: number of rejected orders
      .add("orderPassNum", IntegerType) // slot 3: number of orders that passed the check
      .add("orderStauts", IntegerType) // slot 4: order status (of the row most recently seen by update)
    // The result is returned as a formatted string, see evaluate().
    override def dataType: DataType = StringType
    override def deterministic: Boolean = true
    // Every slot, including the status slot, starts at 0.
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer.update(0, 0)
      buffer.update(1, 0)
      buffer.update(2, 0)
      buffer.update(3, 0)
      buffer.update(4, 0)
    }
    // Folds one input row into the buffer: always bumps the total, overwrites slot 4 with
    // this row's status, and bumps the accept / reject / pass counters when the status is
    // in the corresponding list.
    // NOTE(review): orderNum is declared as var but never reassigned, and bufferStatus is
    // never used.
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val orderStatus = input.getInt(0)
      var orderNum = buffer.getInt(0)
      val orderAcceptNum = buffer.getInt(1)
      val orderRejectNum = buffer.getInt(2)
      val orderPassNum = buffer.getInt(3)
      val bufferStatus = buffer.getInt(4)
      // total order count
      buffer.update(0,orderNum + 1)
      // remember the status of this row (overwrites the previous row's status)
      buffer.update(4, orderStatus)
      // accepted order count
      if ((orderStatus equals 32) || (orderStatus equals 34) || (orderStatus equals 22) || (orderStatus equals 30) || (orderStatus equals 21) || (orderStatus equals 2) || (orderStatus equals 31) || (orderStatus equals 27) || (orderStatus equals 26) || (orderStatus equals 36) || (orderStatus equals 25) || (orderStatus equals 12) || (orderStatus equals 33) || (orderStatus equals 35) || (orderStatus equals 13) || (orderStatus equals 15)) {
        buffer.update(1, 1 + orderAcceptNum)
      }
      // rejected order count
      if (orderStatus equals 3) {
        buffer.update(2, 1 + orderRejectNum)
      }
      // count of orders that passed the check
      if ((orderStatus equals 28) || (orderStatus equals 32) || (orderStatus equals 33) || (orderStatus equals 34) || (orderStatus equals 35)) {
        buffer.update(3, 1 + orderPassNum)
      }
    }
    // Combines buffer2 into buffer1. Only the total (slot 0) is always summed.
    // NOTE(review): the accept / reject / pass counters of buffer2 are added only when
    // buffer2's slot 4 (the status of the last row its update() saw) is in the matching
    // status list; otherwise whatever buffer2 accumulated for that counter is dropped.
    // Slot 4 of buffer1 is not written here, and orderStatus1 is never used.
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val orderNum1 = buffer1.getInt(0)
      val orderAcceptNum1 = buffer1.getInt(1)
      val orderRejectNum1 = buffer1.getInt(2)
      val orderPassNum1 = buffer1.getInt(3)
      val orderStatus1 = buffer1.getInt(4)
      val orderNum2 = buffer2.getInt(0)
      val orderAcceptNum2 = buffer2.getInt(1)
      val orderRejectNum2 = buffer2.getInt(2)
      val orderPassNum2 = buffer2.getInt(3)
      val orderStatus2 = buffer2.getInt(4)
      buffer1.update(0, orderNum1 + orderNum2)
//      buffer1.update(1,orderStatus2)
      // accepted order count: statuses 32,34,22,30,21,2,31,27,26,36,25,12,33,35,13,15
      if ((orderStatus2 equals 32) || (orderStatus2 equals 34) || (orderStatus2 equals 22) || (orderStatus2 equals 30) || (orderStatus2 equals 21) || (orderStatus2 equals 2) || (orderStatus2 equals 31) || (orderStatus2 equals 27) || (orderStatus2 equals 26) || (orderStatus2 equals 36) || (orderStatus2 equals 25) || (orderStatus2 equals 12) || (orderStatus2 equals 33) || (orderStatus2 equals 35) || (orderStatus2 equals 13) || (orderStatus2 equals 15)) {
        buffer1.update(1,orderAcceptNum1 + orderAcceptNum2)
      }
      // rejected order count
      if (orderStatus2 equals 3) {
        buffer1.update(2,orderRejectNum1 + orderRejectNum2)
      }
      // count of orders that passed the check: statuses 28, 32, 33, 34, 35
      if ((orderStatus2 equals 28) || (orderStatus2 equals 32) || (orderStatus2 equals 33) || (orderStatus2 equals 34) || (orderStatus2 equals 35)) {
        buffer1.update(3,orderPassNum1 + orderPassNum2)
      }
    }
    // Formats the final counters as "total_accepted_rejected_passed_passRate", where
    // passRate is passed / total computed in Double arithmetic.
    // NOTE(review): there is no guard for orderNum == 0 before the division.
    override def evaluate(buffer: Row): Any = {
      val orderNum = buffer.getInt(0)
      val orderAcceptNum = buffer.getInt(1)
      val orderRejectNum = buffer.getInt(2)
      val orderPassNum = buffer.getInt(3)
      val orderPassRate = orderPassNum.toDouble / orderNum.toDouble
      val result = s"${orderNum}_${orderAcceptNum}_${orderRejectNum}_${orderPassNum}_${orderPassRate}"
      //println(result)
      // rejected order: status_name = "rejected order"
      result
    }
  }
}

我们说,UDAF分为update,merge,evaluate三大板块
这个UDAF的错误在于:merge不是一个可分布的算法
在这里插入图片描述
如果某个分区update处理的最后一条记录的状态不是28、32、33、34、35之一,那么进入merge时,即使该分区的缓冲区里已经累计了orderPassNum,也会被直接忽略,不参与求和,因为orderStatus2里存放的只是该分区update处理的最后一条记录的状态(接单数、拒单数同理)。
预期是求各状态的和,但是代码是:根据最后一条状态聚合,这不是一个可分布的算法。
这也解释了为什么指定Status时数据很正常:此时所有记录的状态都相同,最后一条记录的状态就能代表整个分区,merge中的条件判断不会丢数据。

修改之后的merge部分代码:

    // Combines two partial aggregation buffers: the four counters held in slots 0 to 3
    // (order, accept, reject and pass counts) of buffer2 are added onto buffer1
    // unconditionally, so the result does not depend on any row status.
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val counterSlots = 0 to 3
      // Read every operand from both buffers before writing anything back.
      val accumulated = counterSlots.map(buffer1.getInt)
      val incoming = counterSlots.map(buffer2.getInt)
      counterSlots.foreach { slot =>
        buffer1.update(slot, accumulated(slot) + incoming(slot))
      }
    }

done!

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值