Spark SQL 的 UDAF(用户自定义聚合函数)可以实现"多进一出"的效果,即把多行输入聚合成一个输出值。下面给大家写了一个求平均值的例子。
pom如下:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Spark SQL UDAF example: Scala 2.11.8 with Spark core/SQL 2.0.2 (Scala 2.11 artifacts). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.wy</groupId>
<artifactId>zoukao2</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.2</version>
</dependency>
</dependencies>
<build>
<!-- pluginManagement only pins plugin versions; the plugins are actually bound to the build in the <plugins> section below. -->
<pluginManagement>
<plugins>
<!-- Plugin that compiles Scala sources -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
</plugin>
<!-- Plugin that compiles Java sources -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<!-- Scala main sources are compiled in process-resources, i.e. before the Java compiler runs in the compile phase. -->
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<!-- Scala test sources are compiled in process-test-resources. -->
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Plugin that builds the jar (shade goal bound to the package phase) -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<!-- For every bundled artifact, leave out the META-INF signature files (*.SF, *.DSA, *.RSA). -->
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
代码如下:
package com.sparksql
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
/**
 * Spark SQL user-defined aggregate function (UDAF) that computes the arithmetic mean
 * of a `Long` column.
 *
 * A UDAF is written by extending `UserDefinedAggregateFunction` and implementing its
 * methods. The aggregation buffer holds a running sum and a running count of the
 * non-null inputs; the result is `sum / count` as a `Double`, or SQL `NULL` when no
 * non-null input was seen.
 *
 * The object also carries a small `main` that registers the function under the name
 * `myavg` and runs it against a JSON file of employees.
 */
object UserDefinedFunction2 extends UserDefinedAggregateFunction {

  /** Schema of the input arguments: a single `Long` column. Field order matters. */
  override def inputSchema: StructType = StructType(StructField("salary", LongType) :: Nil)

  /**
   * Schema of the intermediate aggregation buffer: `total` (running sum) at index 0 and
   * `cnt` (number of non-null inputs) at index 1. Field order matters because the other
   * methods address the buffer by position.
   */
  override def bufferSchema: StructType =
    StructType(StructField("total", LongType) :: StructField("cnt", LongType) :: Nil)

  /** Type of the value produced by [[evaluate]]. */
  override def dataType: DataType = DoubleType

  /** The same set of inputs always yields the same result. */
  override def deterministic: Boolean = true

  /** Resets the buffer described by [[bufferSchema]]: sum = 0, count = 0. */
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L
    buffer(1) = 0L
  }

  /**
   * Folds one input row into the buffer. Called once per input row.
   *
   * @param buffer the aggregation buffer (sum at index 0, count at index 1)
   * @param input  the current input row, laid out according to [[inputSchema]]
   */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    // Null inputs contribute neither to the sum nor to the count.
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getLong(0) + input.getLong(0)
      buffer(1) = buffer.getLong(1) + 1
    }
  }

  /**
   * Merges a second partial buffer into the first one. Needed because the aggregation
   * is computed in separate partial buffers that have to be combined at the end.
   *
   * @param buffer1 the buffer that receives the merged sum and count
   * @param buffer2 the partial buffer being merged in (read-only)
   */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0) // combine partial sums
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1) // combine partial counts
  }

  /**
   * Produces the final result from the fully merged buffer.
   *
   * @param buffer the final aggregation buffer (sum at index 0, count at index 1)
   * @return `sum / count` as a `Double`, or `null` (SQL `NULL`) when the count is zero
   */
  override def evaluate(buffer: Row): Any = {
    val count = buffer.getLong(1)
    // With no non-null input the original expression was 0.0 / 0 and returned NaN.
    // `null` is how this Spark API represents SQL NULL, so it is used deliberately here.
    if (count == 0L) null else buffer.getLong(0).toDouble / count
  }

  /**
   * Demo entry point: loads an employees JSON file, registers this UDAF as `myavg`
   * and prints the average of the `salary` column.
   *
   * @param args optional; `args(0)` is the path of the JSON file to read. When absent,
   *             the path `C:\Users\Desktop\employees.json` is used.
   */
  def main(args: Array[String]): Unit = {
    val employeesPath = args.headOption.getOrElse("C:\\Users\\Desktop\\employees.json")

    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("DataFrameFromStuctType")
      .getOrCreate()

    try {
      // Only show errors from loggers under "org" to keep the console output readable.
      Logger.getLogger("org").setLevel(Level.ERROR)

      val employeeDF: DataFrame = spark.read.json(employeesPath)
      employeeDF.createOrReplaceTempView("employee2")

      // A UDAF must be registered on the session before SQL text can call it by name.
      spark.udf.register("myavg", UserDefinedFunction2)
      spark.sql("select myavg(salary) avgsalary from employee2").show()
    } finally {
      // Stop the session even if reading or querying fails.
      spark.stop()
    }
  }
}
应当注意,Spark 的 UDF、UDAF 函数都必须先通过 SparkSession 对象的 udf.register 方法注册,之后才能在 SQL 语句中按注册的名字调用。