The custom aggregation example from the official docs:
http://spark.apache.org/docs/latest/sql-getting-started.html#aggregations
Spark SQL's built-in aggregate functions cover count(), countDistinct(), avg(), max(), min() and so on, but they certainly won't satisfy every need, so user-defined aggregate functions are required.
Option 1: weakly typed, extending UserDefinedAggregateFunction; rather cumbersome (a minimal sketch follows below)
Option 2: strongly typed, extending Aggregator; recommended
The strongly typed version does not support the SQL query style, i.e. select udf(name) from table.
Instead, convert the data to a Dataset, turn the aggregator into a column, and use the non-SQL query style, i.e. ds.select(udf).
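For reference, here is a minimal sketch of the weakly typed approach, assuming we sum the age column; the class name WeakSum and the registered name weakSum are illustrative, not from the original example (and note UserDefinedAggregateFunction is deprecated since Spark 3.0):

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class WeakSum extends UserDefinedAggregateFunction {
  // Schema of the input arguments: a single Long column
  override def inputSchema: StructType = StructType(StructField("age", LongType) :: Nil)
  // Schema of the aggregation buffer: the running sum
  override def bufferSchema: StructType = StructType(StructField("sum", LongType) :: Nil)
  // Type of the final result
  override def dataType: DataType = LongType
  override def deterministic: Boolean = true
  // Initialize the buffer
  override def initialize(buffer: MutableAggregationBuffer): Unit = buffer(0) = 0L
  // Fold one input row into the buffer, skipping nulls
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    if (!input.isNullAt(0)) buffer(0) = buffer.getLong(0) + input.getLong(0)
  // Merge two buffers from different partitions
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
  // Produce the final result from the buffer
  override def evaluate(buffer: Row): Any = buffer.getLong(0)
}

Because this version can be registered with spark.udf.register("weakSum", new WeakSum), it is usable from plain SQL, e.g. select weakSum(age) from people, unlike the strongly typed Aggregator below.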
Below is the strongly typed example:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

object UDFAverage {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("demo")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    import spark.implicits._
    val ds = spark.read.json("resources/people.json").as[Inner]
    ds.show
    val udaf = new UDFClass
    val udf = udaf.toColumn.name("udf123")
    val result = ds.select(udf)
    result.show
  }
}
// Input type
case class Inner(name: String, age: Long)
// Buffer type used during aggregation
case class BuffSum(var sum: Long, var count: Double)
// Output type
case class Outer(name: String, age: Long, score: Long)
class UDFClass extends Aggregator[Inner, BuffSum, Outer] {
  // Initialize the buffer (the "zero" value of the aggregation)
  override def zero: BuffSum = BuffSum(0, 0)

  // Fold one input row into the buffer
  override def reduce(b: BuffSum, a: Inner): BuffSum = {
    b.sum = b.sum + a.age
    b.count = b.count + 1
    b
  }

  // Merge two buffers; needed because the aggregation runs distributed across partitions
  override def merge(b1: BuffSum, b2: BuffSum): BuffSum = {
    b1.sum = b1.sum + b2.sum
    b1.count = b1.count + b2.count
    b1
  }

  // Produce the final output from the fully merged buffer
  override def finish(reduction: BuffSum): Outer = Outer("test", reduction.sum, reduction.sum)

  // These two are usually boilerplate: Encoders.product for case classes
  override def bufferEncoder: Encoder[BuffSum] = Encoders.product
  override def outputEncoder: Encoder[Outer] = Encoders.product
}
The output looks like this:
+---+-------+-----+
|age| name|score|
+---+-------+-----+
| 22|Michael| 14|
| 30| Andy| 36|
| 18| Justin| 16|
+---+-------+-----+
+----+---+-----+
|name|age|score|
+----+---+-----+
|test| 70| 70|
+----+---+-----+
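As a side note, the same typed aggregator also works per group via groupByKey; a minimal sketch, assuming the ds and udaf values from the example above:

// Hypothetical per-name grouping with the same typed aggregator;
// yields a Dataset[(String, Outer)], one Outer per distinct name
val perName = ds.groupByKey(_.name).agg(udaf.toColumn.name("agg"))
perName.show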
If the output is not a structure like Outer but just a single number, the column is named by the .name(...) call, here udf123:
val udf = udaf.toColumn.name("udf123")
Example:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

object UDFAverage {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("demo")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    import spark.implicits._
    val ds = spark.read.json("resources/people.json").as[Inner]
    ds.show
    val udaf = new UDFClass
    val udf = udaf.toColumn.name("udf123")
    val result = ds.select(udf)
    result.show
  }
}
// Input type
case class Inner(name: String, age: Long)
// Buffer type used during aggregation
case class BuffSum(var sum: Long, var count: Double)
// Output type from the previous example (no longer used here, since finish now returns a Double)
case class Outer(name: String, age: Long)
class UDFClass extends Aggregator[Inner, BuffSum, Double] {
  // Initialize the buffer
  override def zero: BuffSum = BuffSum(0, 0)

  // Fold one input row into the buffer
  override def reduce(b: BuffSum, a: Inner): BuffSum = {
    b.sum = b.sum + a.age
    b.count = b.count + 1
    b
  }

  // Merge two buffers from different partitions
  override def merge(b1: BuffSum, b2: BuffSum): BuffSum = {
    b1.sum = b1.sum + b2.sum
    b1.count = b1.count + b2.count
    b1
  }

  // Final output is now a plain Double instead of a case class
  override def finish(reduction: BuffSum): Double = {
    // Outer("test", reduction.sum)
    reduction.sum.toDouble
  }

  override def bufferEncoder: Encoder[BuffSum] = Encoders.product
  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
The result looks like this:
+---+-------+-----+
|age| name|score|
+---+-------+-----+
| 22|Michael| 14|
| 30| Andy| 36|
| 18| Justin| 16|
+---+-------+-----+
+------+
|udf123|
+------+
| 70.0|
+------+
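Note that although the object is named UDFAverage, both versions of finish return the sum (70), not an average. Since BuffSum already tracks count, a true average would only need a different finish; a sketch of just that method:

// Sketch: divide by the tracked row count to return the mean instead of the sum
override def finish(reduction: BuffSum): Double = reduction.sum / reduction.count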
By default Spark expects JSON Lines, i.e. one complete JSON object per line, so a record that spans multiple lines won't parse. Since Spark 2.2 you can set the multiLine read option (see the sketch at the end), or read the whole file yourself and convert it to a JSON object.
people.json
{"name":"Michael","age": 22,"score":14}
{"name":"Andy", "age":30,"score":36}
{"name":"Justin", "age":18,"score":16}