FlinkSQL Functions in Detail
User-Defined Functions
In addition to its built-in functions, FlinkSQL supports user-defined functions (UDFs), which let us extend the available function set with our own logic. FlinkSQL UDFs fall into four main categories:
1. ScalarFunction: scalar function
Characteristics: receives one row at a time and produces exactly one row with one column. Typical scalar functions: upper(str), lower(str), abs(salary).
2. TableFunction: table function
Characteristics: for each input row it receives at runtime (one or more fields), it can emit multiple rows and multiple columns. Typical examples: explode(), unnest().
3. AggregateFunction: aggregate function
Characteristics: continuously aggregates a group of input rows and finally emits one row (with one or more columns) per group. Typical examples: sum(), max().
4. TableAggregateFunction: table aggregate function
Characteristics: continuously aggregates a group of input rows and finally emits one or more rows (with one or more columns) per group; since this category has no worked case below, a minimal sketch follows this list.
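The three cases below cover the first three categories. As there is no worked case for TableAggregateFunction in this article, here is a minimal sketch modeled on the classic "Top 2" example from the Flink documentation; the names Top2 and Top2Accumulator are illustrative, not part of the cases that follow. Note that in Flink 1.14 a table aggregate function can only be invoked through the Table API's flatAggregate, not from SQL.

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.table.functions.TableAggregateFunction;
import org.apache.flink.util.Collector;

// Accumulator holding the two largest values seen so far
public class Top2Accumulator {
    public Integer first = Integer.MIN_VALUE;
    public Integer second = Integer.MIN_VALUE;
}

// Emits up to two rows per group: the largest and the second-largest value
public class Top2 extends TableAggregateFunction<Tuple2<Integer, Integer>, Top2Accumulator> {
    @Override
    public Top2Accumulator createAccumulator() {
        return new Top2Accumulator();
    }

    // Called once per input row; keeps the two largest values
    public void accumulate(Top2Accumulator acc, Integer value) {
        if (value > acc.first) {
            acc.second = acc.first;
            acc.first = value;
        } else if (value > acc.second) {
            acc.second = value;
        }
    }

    // Emits (value, rank) pairs for the current group
    public void emitValue(Top2Accumulator acc, Collector<Tuple2<Integer, Integer>> out) {
        if (acc.first != Integer.MIN_VALUE) {
            out.collect(Tuple2.of(acc.first, 1));
        }
        if (acc.second != Integer.MIN_VALUE) {
            out.collect(Tuple2.of(acc.second, 2));
        }
    }
}

It would be used roughly as tableEnv.from("t").groupBy($("key")).flatAggregate(call(Top2.class, $("value"))).select($("key"), $("f0"), $("f1")), where "t", "key", and "value" are placeholder names.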
Example 1: ScalarFunction
input/userbase.json
{"date_time":"2022-10-04 08:01:48","email":"kyzqcd0686@vjikg.tng","id":0,"name":"郑潮"}
{"date_time":"2022-10-04 08:06:31","email":"bvkqwbmgwi@qq.com","id":1,"name":"图丘喜造"}
{"date_time":"2022-10-04 08:04:39","email":"axvcbj7vbo@ecyi1.4gw","id":2,"name":"王联介"}
{"date_time":"2022-10-04 08:00:19","email":"ew1qu5sunz@caxtg.vtn","id":3,"name":"赖溯"}
{"date_time":"2022-10-04 08:04:51","email":"50xdhnfppw@vwreu.kxk","id":4,"name":"钱泼奎"}
{"date_time":"2022-10-04 08:02:25","email":"h8ist2s54kalorkp,79s","id":5,"name":"紧迟亏"}
{"date_time":"2022-10-04 08:05:22","email":"hnzfdmnjgo@rsiq9.syx","id":6,"name":"贾监"}
{"date_time":"2022-10-04 08:03:53","email":"apjlg5pyuo@hs6l0j4","id":7,"name":"蔡"}
{"date_time":"2022-10-04 08:05:35","email":"cpqofnn5xd@7iknhqc5","id":8,"name":"蔡不"}
{"date_time":"2022-10-04 08:05:03","email":"wg3nfjdv9@fomvu2kb","id":9,"name":"赖妖"}
{"date_time":"2022-10-04 08:06:37","email":"1kkaib5ie@ecvb86cs","id":10,"name":"毛溜拳"}
package org.example.udf;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.table.functions.FunctionContext;
import org.apache.flink.table.functions.ScalarFunction;

// Scalar function: given a raw JSON line and a key, returns the value for that key
public class JsonParseFunction extends ScalarFunction {

    @Override
    public void open(FunctionContext context) throws Exception {
        super.open(context);
    }

    @Override
    public void close() throws Exception {
        super.close();
    }

    // eval() is resolved by Flink via reflection; it is called once per input row
    public String eval(String jsonLine, String key) {
        JSONObject jsonObject = JSONObject.parseObject(jsonLine);
        if (jsonObject.containsKey(key)) {
            return jsonObject.getString(key);
        } else {
            return "";
        }
    }
}
package org.example;

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.example.udf.JsonParseFunction;

public class FlinkSqlscalarFunction {
    public static void main(String[] args) {
        // Obtain the TableEnvironment
        EnvironmentSettings environmentSettings = EnvironmentSettings.newInstance().build();
        TableEnvironment tableEnvironment = TableEnvironment.create(environmentSettings);
        tableEnvironment.executeSql("CREATE TABLE userbase (" +
                "line STRING" +
                ") WITH (" +
                "'connector' = 'filesystem'," +
                "'path' = 'input/userbase.json'," +
                "'format' = 'raw'" + // the raw format reads each whole line as a single record
                ")");
        tableEnvironment.sqlQuery("select * from userbase").execute().print();
        // Register the scalar function, then use it to extract fields from the raw JSON line
        tableEnvironment.createTemporarySystemFunction("JsonParse", JsonParseFunction.class);
        tableEnvironment.sqlQuery("select JsonParse(line,'date_time'),JsonParse(line,'email'),JsonParse(line,'id'),JsonParse(line,'name') from userbase").execute().print();
    }
}
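The SQL invocation above can equally be expressed through the Table API. A minimal sketch, assuming the same tableEnvironment and the "JsonParse" registration from the listing above:

import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.call;

// Table API equivalent of: select JsonParse(line,'name') from userbase
tableEnvironment.from("userbase")
        .select(call("JsonParse", $("line"), "name"))
        .execute()
        .print();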
Example 2: TableFunction
input/produce_user.json
{"begin_time":1665145907806,"price":258.7,"producetId":920956185,"userBaseList":[{"date_time":"2022-10-07 08:01:48","email":"kyzqcd0686@vjikg.tng","id":0,"name":"郑潮"},{"date_time":"2022-10-07 08:06:31","email":"bvkqwbmgwi@qq.com","id":1,"name":"图丘喜造"},{"date_time":"2022-10-07 08:04:39","email":"axvcbj7vbo@ecyi1.4gw","id":2,"name":"王联介"},{"date_time":"2022-10-07 08:00:19","email":"ew1qu5sunz@caxtg.vtn","id":3,"name":"赖溯"},{"date_time":"2022-10-07 08:04:51","email":"50xdhnfppw@vwreu.kxk","id":4,"name":"钱泼奎"}]}
{"begin_time":1665145915652,"price":258.7,"producetId":-786075263,"userBaseList":[{"date_time":"2022-10-07 08:02:25","email":"h8ist2s54kalorkp,79s","id":5,"name":"紧迟亏"},{"date_time":"2022-10-07 08:05:22","email":"hnzfdmnjgo@rsiq9.syx","id":6,"name":"贾监"},{"date_time":"2022-10-07 08:03:53","email":"apjlg5pyuo@hs6l0j4","id":7,"name":"蔡"},{"date_time":"2022-10-07 08:05:35","email":"cpqofnn5xd@7iknhqc5","id":8,"name":"蔡不"},{"date_time":"2022-10-07 08:05:03","email":"wg3nfjdv9@fomvu2kb","id":9,"name":"赖妖"}]}
{"begin_time":1665145927285,"price":258.7,"producetId":-988723330,"userBaseList":[{"date_time":"2022-10-07 09:01:48","email":"kyzqcd03486@vjikg.tng","id":11,"name":"郑潮1"},{"date_time":"2022-10-07 09:06:31","email":"bvkqwbm1gwi@qq.com","id":12,"name":"图丘喜造1"},{"date_time":"2022-10-07 10:04:39","email":"axvcbj71vbo@ecyi1.4gw","id":13,"name":"王联介3"},{"date_time":"2022-10-07 09:00:19","email":"ew11qu5sunz@caxtg.vtn","id":14,"name":"赖溯1"},{"date_time":"2022-10-07 09:04:51","email":"50xdhnfppw@vwre1u.kxk","id":15,"name":"钱泼奎1"}]}
package org.example.udtf;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.functions.ScalarFunction;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

public class FlinkSQLTableFunction {
    public static void main(String[] args) {
        // Obtain the TableEnvironment
        EnvironmentSettings environmentSettings = EnvironmentSettings.newInstance().build();
        TableEnvironment tableEnvironment = TableEnvironment.create(environmentSettings);
        // Create the table
        tableEnvironment.executeSql("CREATE TABLE userbase (" +
                "line STRING" +
                ") WITH (" +
                "'connector' = 'filesystem'," +
                "'path' = 'input/produce_user.json'," +
                "'format' = 'raw'" + // the raw format reads each whole line as a single record
                ")");
        // Register the functions
        tableEnvironment.createTemporarySystemFunction("JsonFunc", JsonFunction.class);
        tableEnvironment.createTemporarySystemFunction("explodeFunc", ExplodeFunc.class);
        // Query logic: cross-join each raw line with the rows produced by the table function
        tableEnvironment.sqlQuery("select JsonFunc(line,'begin_time'),JsonFunc(line,'price'),JsonFunc(line,'producetId'),id,name,date_time,email from userbase,lateral table(explodeFunc(line,'userBaseList'))")
                .execute().print();
    }

    /**
     * Custom ScalarFunction that extracts a single field from a JSON line.
     */
    public static class JsonFunction extends ScalarFunction {
        public String eval(String jsonLine, String key) {
            JSONObject jsonObject = JSONObject.parseObject(jsonLine);
            if (jsonObject.containsKey(key)) {
                return jsonObject.getString(key);
            } else {
                return "";
            }
        }
    }

    // Takes one JSON array per input row and fans it out into multiple output rows
    @FunctionHint(output = @DataTypeHint("ROW<id STRING, name STRING, date_time STRING, email STRING>"))
    public static class ExplodeFunc extends TableFunction<Row> {
        public void eval(String line, String key) {
            JSONObject jsonObject = JSONObject.parseObject(line);
            JSONArray jsonArray = jsonObject.getJSONArray(key);
            for (int i = 0; i < jsonArray.size(); i++) {
                String date_time = jsonArray.getJSONObject(i).getString("date_time");
                String id = jsonArray.getJSONObject(i).getString("id");
                String name = jsonArray.getJSONObject(i).getString("name");
                String email = jsonArray.getJSONObject(i).getString("email");
                // collect() emits one output row per array element
                collect(Row.of(id, name, date_time, email));
            }
        }
    }
}
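One detail to be aware of: the comma (cross) join used above silently drops any input row whose userBaseList array is empty or missing, because the table function emits no rows for it. If such rows should be kept with NULL columns instead, Flink SQL offers a LEFT JOIN LATERAL variant; a sketch against the same table and functions as above:

// Keeps userbase rows even when explodeFunc emits no rows for them
tableEnvironment.sqlQuery(
        "select JsonFunc(line,'begin_time'), id, name, date_time, email " +
        "from userbase " +
        "left join lateral table(explodeFunc(line,'userBaseList')) on true")
        .execute().print();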
Example 3: AggregateFunction
input/score.csv
1,zhangsan,Chinese,80
1,zhangsan,Math,76
1,zhangsan,Science,84
1,zhangsan,Art,90
2,lisi,Chinese,60
2,lisi,Math,78
2,lisi,Science,86
2,lisi,Art,88
package org.example.udaf;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.functions.AggregateFunction;

public class FlinkSQLAggregrateFunction {
    public static void main(String[] args) {
        // Obtain the TableEnvironment
        EnvironmentSettings environmentSettings = EnvironmentSettings.newInstance().build();
        TableEnvironment tableEnvironment = TableEnvironment.create(environmentSettings);
        tableEnvironment.executeSql("CREATE TABLE source_score (" +
                "id INT," +
                "name STRING," +
                "course STRING," +
                "score DOUBLE" +
                ") WITH (" +
                "'connector' = 'filesystem'," +
                "'path' = 'input/score.csv'," +
                "'format' = 'csv'" +
                ")");
        // Register the aggregate function
        tableEnvironment.createTemporarySystemFunction("AvgFunc", AvgFunc.class);
        tableEnvironment.executeSql("select course, AvgFunc(score) as avg_score from source_score group by course").print();
    }

    public static class AvgFunc extends AggregateFunction<Double, AvgAccumulator> {
        // Computes the final result from the accumulator
        @Override
        public Double getValue(AvgAccumulator accumulator) {
            if (accumulator.count == 0) {
                return null;
            } else {
                return accumulator.sum / accumulator.count;
            }
        }

        @Override
        public AvgAccumulator createAccumulator() {
            return new AvgAccumulator();
        }

        // Called once per input row of the group; resolved by Flink via reflection
        public void accumulate(AvgAccumulator acc, Double score) {
            acc.setSum(acc.sum + score);
            acc.setCount(acc.count + 1);
        }
    }

    // Mutable accumulator holding the running sum and count
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class AvgAccumulator {
        public double sum = 0.0;
        public int count = 0;
    }
}
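For a bounded job like this one, accumulate plus getValue is sufficient. In streaming scenarios Flink may additionally require a retract method (when the input can update or delete rows) and a merge method (for example, when windows are merged); both are resolved by name via reflection, like accumulate. A sketch of what they could look like for this accumulator:

// Undoes a previous accumulate() call, e.g. when an input row is retracted
public void retract(AvgAccumulator acc, Double score) {
    acc.setSum(acc.sum - score);
    acc.setCount(acc.count - 1);
}

// Folds several partial accumulators into one, e.g. when windows are merged
public void merge(AvgAccumulator acc, Iterable<AvgAccumulator> others) {
    for (AvgAccumulator other : others) {
        acc.setSum(acc.sum + other.sum);
        acc.setCount(acc.count + other.count);
    }
}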
POM dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>flinkSql</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <scala.binary.version>2.12</scala.binary.version>
        <flink.version>1.14.3</flink.version>
        <hadoop.version>3.1.4</hadoop.version>
        <hbase.version>2.2.7</hbase.version>
        <hive.version>3.1.2</hive.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.22</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.73</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>1.7.15</version>
        </dependency>
        <!-- jars needed for Flink to read and write HDFS -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-hadoop-compatibility_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.commons</groupId>
                    <artifactId>commons-math3</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.commons</groupId>
                    <artifactId>commons-compress</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-math3</artifactId>
            <version>3.5</version>
        </dependency>
        <!-- jars needed for the Flink/HBase integration -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hbase-2.2_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.commons</groupId>
                    <artifactId>commons-math3</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <!-- jars needed for the Flink/Kafka integration -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-hdfs</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.commons</groupId>
                    <artifactId>commons-compress</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.thrift</groupId>
            <artifactId>libfb303</artifactId>
            <version>0.9.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.1.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals><goal>shade</goal></goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-dependency-plugin</artifactId>
                <version>2.10</version>
                <executions>
                    <execution>
                        <id>copy-dependencies</id>
                        <phase>package</phase>
                        <goals><goal>copy-dependencies</goal></goals>
                        <configuration>
                            <outputDirectory>${project.build.directory}/lib</outputDirectory>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <!--<plugin>
                <groupId>org.scala.tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>-->
        </plugins>
    </build>
</project>