首先贴一下同事写的 UDAF 函数:
package com.wby.fans.common
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{StringType, _}
/** Container object for the order-statistics UDAF defined below. */
object OrderUDAF {
// Counters produced: accepted orders, rejected orders, inspection-passed orders, total orders
// Related column names: order_acceptance_num order_reject_num order_pass_num order_num
/**
 * Spark SQL UDAF that counts orders by status category and returns the counts as one
 * underscore-separated string (see `evaluate`).
 *
 * Input columns: `status` (Int) and `order_id` (Int). Only `status` is read by `update`;
 * `order_id` is declared in the input schema but never used.
 *
 * NOTE(review): `merge` below combines partial buffers based on the last status stored in
 * the incoming buffer, so its result depends on row order and on how rows are split across
 * partial buffers. See the note on `merge`.
 */
class DealOrderNum() extends UserDefinedAggregateFunction {
// Two Int input columns; update() reads only column 0 (status).
override def inputSchema: StructType = new StructType().add("status", IntegerType).add("order_id",IntegerType)
// Buffer layout: slots 0-3 are counters, slot 4 holds the status of the last row seen by update().
override def bufferSchema: StructType = new StructType()
.add("orderNum", IntegerType) // slot 0: total order count
.add("orderAcceptNum", IntegerType) // slot 1: accepted order count
.add("orderRejectNum", IntegerType) // slot 2: rejected order count
.add("orderPassNum", IntegerType) // slot 3: inspection-passed order count
.add("orderStauts", IntegerType) // slot 4: last seen order status (field name spelled as in the original)
// The aggregate result is a single formatted string.
override def dataType: DataType = StringType
override def deterministic: Boolean = true
/** Resets all five buffer slots to 0. */
override def initialize(buffer: MutableAggregationBuffer): Unit = {
buffer.update(0, 0)
buffer.update(1, 0)
buffer.update(2, 0)
buffer.update(3, 0)
buffer.update(4, 0)
}
/**
 * Folds one input row into the buffer: increments the total, stores the row's status in
 * slot 4, and increments whichever category counters the status belongs to.
 */
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
val orderStatus = input.getInt(0)
// Declared as var but never reassigned.
var orderNum = buffer.getInt(0)
val orderAcceptNum = buffer.getInt(1)
val orderRejectNum = buffer.getInt(2)
val orderPassNum = buffer.getInt(3)
// Read but not used anywhere below.
val bufferStatus = buffer.getInt(4)
// Total order count
buffer.update(0,orderNum + 1)
// Remember the status of this row; each call overwrites the previous value
buffer.update(4, orderStatus)
// Accepted order count: status is one of 32,34,22,30,21,2,31,27,26,36,25,12,33,35,13,15
if ((orderStatus equals 32) || (orderStatus equals 34) || (orderStatus equals 22) || (orderStatus equals 30) || (orderStatus equals 21) || (orderStatus equals 2) || (orderStatus equals 31) || (orderStatus equals 27) || (orderStatus equals 26) || (orderStatus equals 36) || (orderStatus equals 25) || (orderStatus equals 12) || (orderStatus equals 33) || (orderStatus equals 35) || (orderStatus equals 13) || (orderStatus equals 15)) {
buffer.update(1, 1 + orderAcceptNum)
}
// Rejected order count: status 3
if (orderStatus equals 3) {
buffer.update(2, 1 + orderRejectNum)
}
// Inspection-passed order count: status is one of 28, 32, 33, 34, 35
if ((orderStatus equals 28) || (orderStatus equals 32) || (orderStatus equals 33) || (orderStatus equals 34) || (orderStatus equals 35)) {
buffer.update(3, 1 + orderPassNum)
}
}
/**
 * Combines the partial buffer `buffer2` into `buffer1`.
 *
 * NOTE(review): only the total (slot 0) is always summed. Each category counter of
 * `buffer2` is added only when `buffer2`'s last recorded status (slot 4) belongs to that
 * category. If the last row seen by `buffer2` had a different status, the counts it has
 * accumulated for that category are dropped. The outcome therefore depends on row order
 * and on how rows were split across partial buffers. An order-independent merge would
 * add slots 1-3 unconditionally. Slot 4 of `buffer1` is not updated here.
 */
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
val orderNum1 = buffer1.getInt(0)
val orderAcceptNum1 = buffer1.getInt(1)
val orderRejectNum1 = buffer1.getInt(2)
val orderPassNum1 = buffer1.getInt(3)
// Read but not used anywhere below.
val orderStatus1 = buffer1.getInt(4)
val orderNum2 = buffer2.getInt(0)
val orderAcceptNum2 = buffer2.getInt(1)
val orderRejectNum2 = buffer2.getInt(2)
val orderPassNum2 = buffer2.getInt(3)
// Status of the last row that update() folded into buffer2.
val orderStatus2 = buffer2.getInt(4)
buffer1.update(0, orderNum1 + orderNum2)
// buffer1.update(1,orderStatus2)
// Accepted order count, gated on buffer2's last status being one of 32,34,22,30,21,2,31,27,26,36,25,12,33,35,13,15
if ((orderStatus2 equals 32) || (orderStatus2 equals 34) || (orderStatus2 equals 22) || (orderStatus2 equals 30) || (orderStatus2 equals 21) || (orderStatus2 equals 2) || (orderStatus2 equals 31) || (orderStatus2 equals 27) || (orderStatus2 equals 26) || (orderStatus2 equals 36) || (orderStatus2 equals 25) || (orderStatus2 equals 12) || (orderStatus2 equals 33) || (orderStatus2 equals 35) || (orderStatus2 equals 13) || (orderStatus2 equals 15)) {
buffer1.update(1,orderAcceptNum1 + orderAcceptNum2)
}
// Rejected order count, gated on buffer2's last status being 3
if (orderStatus2 equals 3) {
buffer1.update(2,orderRejectNum1 + orderRejectNum2)
}
// Inspection-passed order count, gated on buffer2's last status being one of 28, 32, 33, 34, 35
if ((orderStatus2 equals 28) || (orderStatus2 equals 32) || (orderStatus2 equals 33) || (orderStatus2 equals 34) || (orderStatus2 equals 35)) {
buffer1.update(3,orderPassNum1 + orderPassNum2)
}
}
/**
 * Formats the final buffer as "orderNum_orderAcceptNum_orderRejectNum_orderPassNum_orderPassRate".
 *
 * The pass rate is orderPassNum / orderNum using Double division, so a buffer whose
 * orderNum is 0 yields NaN for the rate instead of throwing.
 */
override def evaluate(buffer: Row): Any = {
val orderNum = buffer.getInt(0)
val orderAcceptNum = buffer.getInt(1)
val orderRejectNum = buffer.getInt(2)
val orderPassNum = buffer.getInt(3)
val orderPassRate = orderPassNum.toDouble / orderNum.toDouble
val result = s"${orderNum}_${orderAcceptNum}_${orderRejectNum}_${orderPassNum}_${orderPassRate}"
//println(result)
// Original author's note: rejected orders correspond to status_name = "拒单" ("rejected")
result
}
}
}
我们说,UDAF分为update,merge,evaluate三大板块
这个UDAF的错误在于:merge不是一个可分布的算法
如果某个局部聚合缓冲区(buffer2)中 update 处理的最后一条记录的状态不是 28、32、33、34、35 之一,那么进入 merge 时,即使该缓冲区里已经累计了 orderPassNum,也会被直接忽略、不参与求和——因为 orderStatus2 里存放的只是该缓冲区 update 处理的最后一条记录的状态。
预期是对各类计数分别求和,但代码实际上是根据最后一条记录的状态来决定是否合并,所以这不是一个可分布的算法。
这也解释了为什么在查询中指定某个 Status 过滤时,数据很正常:此时每个缓冲区里所有记录的状态都相同,最后一条记录的状态与被统计的状态一致,merge 中的判断不会漏掉任何计数。
修改之后的merge部分代码:
/**
 * Merges the partial buffer `buffer2` into `buffer1` by adding the four counters
 * slot by slot: 0 = orderNum, 1 = orderAcceptNum, 2 = orderRejectNum, 3 = orderPassNum.
 *
 * Every counter is summed unconditionally, so the result does not depend on the order in
 * which rows were seen or on how they were split across partial buffers. Slot 4 (the last
 * seen status) is not touched here.
 *
 * @param buffer1 the buffer that receives the merged counters
 * @param buffer2 the partial buffer whose counters are added in
 */
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
  val counterSlots = 0 to 3
  // Compute every sum before writing any of them, so all reads happen before the first update.
  val mergedCounts = counterSlots.map(slot => buffer1.getInt(slot) + buffer2.getInt(slot))
  counterSlots.zip(mergedCounts).foreach { case (slot, total) =>
    buffer1.update(slot, total)
  }
}
done!