Custom Aggregate Functions in SparkSQL

The official documentation introduces custom aggregation with examples here:

http://spark.apache.org/docs/latest/sql-getting-started.html#aggregations

SparkSQL's built-in aggregate functions include count(), countDistinct(), avg(), max(), min(), and so on, but they certainly cannot cover every need, so user-defined aggregate functions are required.
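For reference, a minimal sketch of the built-ins, assuming the SparkSession named spark and the people.json file from the examples below:

import org.apache.spark.sql.functions.{avg, countDistinct, max}

// Built-in aggregates need no custom code
val df = spark.read.json("resources/people.json")
df.select(avg("age"), max("age"), countDistinct("name")).show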

Approach 1: weakly typed, extends UserDefinedAggregateFunction. More cumbersome (and deprecated since Spark 3.0); a sketch follows below.

Approach 2: strongly typed, extends Aggregator. Recommended.

Prior to Spark 3.0, the strongly typed Aggregator cannot be called from a SQL query, i.e. select udf(name) from table (see the note after the first example's output).

Instead, convert the data to a Dataset, turn the aggregator into a named column with toColumn, and query in the DSL style, i.e. ds.select(udf).
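For completeness, here is a minimal sketch of the weakly typed approach, adapted from the docs page linked above (UserDefinedAggregateFunction is deprecated since Spark 3.0):

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class MyAverage extends UserDefinedAggregateFunction {
  // Data type of the input arguments
  def inputSchema: StructType = StructType(StructField("inputColumn", LongType) :: Nil)
  // Data types of the values in the aggregation buffer
  def bufferSchema: StructType =
    StructType(StructField("sum", LongType) :: StructField("count", LongType) :: Nil)
  // Data type of the returned value
  def dataType: DataType = DoubleType
  // Whether this function always returns the same output for the same input
  def deterministic: Boolean = true
  // Initialize the buffer
  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L
    buffer(1) = 0L
  }
  // Fold one input row into the buffer
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getLong(0) + input.getLong(0)
      buffer(1) = buffer.getLong(1) + 1
    }
  }
  // Merge two buffers from different partitions
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }
  // Compute the final result
  def evaluate(buffer: Row): Double = buffer.getLong(0).toDouble / buffer.getLong(1)
}

// Because it works on Rows with an explicit schema, this form can be used from SQL directly
spark.udf.register("myAverage", new MyAverage)
spark.read.json("resources/people.json").createOrReplaceTempView("people")
spark.sql("SELECT myAverage(age) AS avg_age FROM people").show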

Below is a full strongly typed example:

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

object UDFAverage{
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("demo")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    import spark.implicits._

    // Read the JSON-lines file and convert it to a typed Dataset
    val ds = spark.read.json("resources/people.json").as[Inner]
    ds.show
    val udaf = new UDFClass
    // Wrap the typed Aggregator as a TypedColumn named "udf123"
    val udf = udaf.toColumn.name("udf123")
    val result = ds.select(udf)
    result.show
  }

}
// Input type
case class Inner(name: String, age: Long)
// Intermediate buffer type used during aggregation
case class BuffSum(var sum: Long, var count: Double)
// Output type
case class Outer(name: String, age: Long, score: Long)
class UDFClass extends Aggregator[Inner,BuffSum,Outer]{
  override def zero: BuffSum = {
    // Initial value of the aggregation buffer
    BuffSum(0, 0)
  }

  override def reduce(b: BuffSum, a: Inner): BuffSum = {
    // Fold one input row into the running buffer
    b.sum = b.sum + a.age
    b.count = b.count + 1
    b
  }

  override def merge(b1: BuffSum, b2: BuffSum): BuffSum = {
    // Merge buffers from different partitions, since execution is distributed
    b1.sum = b1.sum + b2.sum
    b1.count = b1.count + b2.count
    b1
  }

  override def finish(reduction: BuffSum): Outer = {
    // Produce the final output row (count is tracked but unused here)
    Outer("test", reduction.sum, reduction.sum)
  }

  // Encoder for the buffer type; Encoders.product handles case classes
  override def bufferEncoder: Encoder[BuffSum] = Encoders.product
  // Encoder for the output type
  override def outputEncoder: Encoder[Outer] = Encoders.product
}


The output looks like this (first ds.show, then the aggregated result):
+---+-------+-----+
|age|   name|score|
+---+-------+-----+
| 22|Michael|   14|
| 30|   Andy|   36|
| 18| Justin|   16|
+---+-------+-----+

+----+---+-----+
|name|age|score|
+----+---+-----+
|test| 70|   70|
+----+---+-----+
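One caveat to the SQL limitation mentioned earlier: since Spark 3.0, a typed Aggregator can be exposed to SQL through functions.udaf. A minimal sketch, reusing BuffSum from above and assuming Spark 3.0+; MyAvg and my_avg are illustrative names:

import org.apache.spark.sql.{Encoder, Encoders, functions}
import org.apache.spark.sql.expressions.Aggregator

// Aggregator over a single Long column, so it can be called as a SQL function
object MyAvg extends Aggregator[Long, BuffSum, Double] {
  def zero: BuffSum = BuffSum(0, 0)
  def reduce(b: BuffSum, a: Long): BuffSum = { b.sum = b.sum + a; b.count = b.count + 1; b }
  def merge(b1: BuffSum, b2: BuffSum): BuffSum = { b1.sum = b1.sum + b2.sum; b1.count = b1.count + b2.count; b1 }
  def finish(r: BuffSum): Double = r.sum / r.count  // average this time
  def bufferEncoder: Encoder[BuffSum] = Encoders.product
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

// Spark 3.0+: register the typed Aggregator for SQL use
spark.udf.register("my_avg", functions.udaf(MyAvg))
spark.read.json("resources/people.json").createOrReplaceTempView("people")
spark.sql("SELECT my_avg(age) FROM people").show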

If, instead of the structured Outer type, the aggregator outputs just a single number, the column is named udf123, taken from:

val udf = udaf.toColumn.name("udf123")

Example:

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

object UDFAverage{
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("demo")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    import spark.implicits._

    val ds = spark.read.json("resources/people.json").as[Inner]
    ds.show
    val udaf = new UDFClass
    val udf = udaf.toColumn.name("udf123")
    val result = ds.select(udf)
    result.show
  }

}
// Input type
case class Inner(name: String, age: Long)
// Intermediate buffer type used during aggregation
case class BuffSum(var sum: Long, var count: Double)
// Output type (unused in this variant)
case class Outer(name: String, age: Long)
class UDFClass extends Aggregator[Inner,BuffSum,Double]{
  override def zero: BuffSum = {
    // Initial value of the aggregation buffer
    BuffSum(0, 0)
  }

  override def reduce(b: BuffSum, a: Inner): BuffSum = {
    // Fold one input row into the running buffer
    b.sum = b.sum + a.age
    b.count = b.count + 1
    b
  }

  override def merge(b1: BuffSum, b2: BuffSum): BuffSum = {
    // Merge buffers from different partitions, since execution is distributed
    b1.sum = b1.sum + b2.sum
    b1.count = b1.count + b2.count
    b1
  }

  override def finish(reduction: BuffSum): Double = {
    // Return a single number instead of a structured Outer("test", reduction.sum)
    reduction.sum
  }

  // Encoder for the buffer type; Encoders.product handles case classes
  override def bufferEncoder: Encoder[BuffSum] = Encoders.product

  // A primitive output needs the matching primitive encoder
  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}


This variant's output looks like this:
+---+-------+-----+
|age|   name|score|
+---+-------+-----+
| 22|Michael|   14|
| 30|   Andy|   36|
| 18| Justin|   16|
+---+-------+-----+

+------+
|udf123|
+------+
|  70.0|
+------+

 

Note that spark.read.json by default expects JSON Lines input: one complete JSON object per line, so a single record cannot span multiple lines. For a pretty-printed multi-line JSON file, either use the multiLine read option (Spark 2.2+, shown after the sample file below) or read the whole file yourself and parse it into JSON objects.

people.json:

{"name":"Michael","age": 22,"score":14}
{"name":"Andy", "age":30,"score":36}
{"name":"Justin", "age":18,"score":16}

 

 

 

 
