FlinkSql自定义函数使用
默认分为三种:
UDF:也叫标量函数,输入一行返回一行
UDTF:也叫表值函数,输入一行返回多行
UDAF:也叫聚合函数,官方分为聚合函数和表聚合函数两种。前一种实现输入多行返回一行,后一种输入多行返回一张表。
接下来通过三个案例来实现自定义函数
UDF
需求:随机输入10个国内IP,解析成“省份-城市”
import java.util.ArrayList;
import java.util.Random;
public class GetRandomIp {
    /**
     * Inclusive {start, end} IPv4 ranges encoded as signed 32-bit integers
     * (addresses above 127.255.255.255 are therefore negative).
     */
    private static final int[][] RANGES = {
        {607649792, 608174079},     // 36.56.0.0-36.63.255.255
        {1038614528, 1039007743},   // 61.232.0.0-61.237.255.255
        {1783627776, 1784676351},   // 106.80.0.0-106.95.255.255
        {2035023872, 2035154943},   // 121.76.0.0-121.77.255.255
        {2078801920, 2079064063},   // 123.232.0.0-123.235.255.255
        {-1950089216, -1948778497}, // 139.196.0.0-139.215.255.255
        {-1425539072, -1425014785}, // 171.8.0.0-171.15.255.255
        {-1236271104, -1235419137}, // 182.80.0.0-182.92.255.255
        {-770113536, -768606209},   // 210.25.0.0-210.47.255.255
        {-569376768, -564133889},   // 222.16.0.0-222.95.255.255
    };

    /** Number of addresses produced by one call to {@link #getRandomIp()}. */
    private static final int IP_COUNT = 10;

    /**
     * Generates 10 random IPv4 addresses taken from the ranges in {@link #RANGES}.
     *
     * <p>A range is drawn independently for every address, so a single call
     * normally yields addresses from several different ranges.
     *
     * @return a new list of 10 dotted-decimal IPv4 strings
     */
    public static ArrayList<String> getRandomIp() {
        // One generator for the whole call instead of a fresh one per address.
        Random random = new Random();
        ArrayList<String> list = new ArrayList<>(IP_COUNT);
        for (int i = 0; i < IP_COUNT; i++) {
            // Derive the bound from the table so adding/removing ranges stays safe.
            int[] range = RANGES[random.nextInt(RANGES.length)];
            // +1 because nextInt's bound is exclusive and the ranges are inclusive.
            int offset = random.nextInt(range[1] - range[0] + 1);
            list.add(num2ip(range[0] + offset));
        }
        return list;
    }

    /**
     * Converts a 32-bit integer into dotted-decimal IPv4 notation.
     *
     * @param ip the address as a signed 32-bit integer, most significant octet first
     * @return the address formatted as {@code a.b.c.d}
     */
    private static String num2ip(int ip) {
        // Masking with 0xff after the shift discards any sign extension.
        int first = (ip >> 24) & 0xff;
        int second = (ip >> 16) & 0xff;
        int third = (ip >> 8) & 0xff;
        int fourth = ip & 0xff;
        return first + "." + second + "." + third + "." + fourth;
    }
}
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URL;
public class IPpaserUtil implements Serializable {
    /**
     * Looks up the location of an IP address by calling the pconline whois HTTP endpoint.
     *
     * <p>The response body is decoded as GBK, parsed as JSON, and its {@code pro}
     * and {@code city} fields are joined with a hyphen.
     *
     * @param ip the IP address appended to the request URL as the {@code ip} query parameter
     * @return {@code "pro-city"} on an HTTP 200 response, an empty string for any other
     *         status code, or {@code null} if any exception occurs during the request
     *         or while parsing the response
     */
    public static String getAddress(String ip) {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL("http://whois.pconline.com.cn/ipJson.jsp?ip=" + ip + "&json=true");
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setRequestMethod("GET");
            conn.setUseCaches(false);
            conn.setReadTimeout(6000);
            conn.setConnectTimeout(6000);
            conn.setInstanceFollowRedirects(false);
            if (conn.getResponseCode() != 200) {
                return "";
            }
            StringBuilder sb = new StringBuilder();
            // try-with-resources closes the reader and stream even if reading fails.
            try (InputStream in = conn.getInputStream();
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, "GBK"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
            JSONObject jsonObject = JSON.parseObject(sb.toString());
            return jsonObject.getString("pro") + "-" + jsonObject.getString("city");
        } catch (Exception e) {
            // Best-effort lookup: report the failure and let the caller see null.
            e.printStackTrace();
            return null;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }
}
import java.text.SimpleDateFormat;
import java.util.Date;
//工具类:实现字符串转时间戳类型
public class StringTimeToTimeStamp {
public static Long strtime_timestamp(String times) {
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
try {
Date date = format.parse(times); // 将时间字符串转换为Date类型
Long timestamp = date.getTime(); // 获取时间戳
return timestamp;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
}
import com.utils.GetRandomIp;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Scalar-function (UDF) demo: one row in, one row out.
 * Feeds the addresses from GetRandomIp.getRandomIp() into a table and
 * applies the ip_parser function to each of them.
 */
public class UdfApp {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableenv = StreamTableEnvironment.create(env);

        // Strips surrounding whitespace from every address before it reaches the table.
        MapFunction<String, String> trimIp = new MapFunction<String, String>() {
            @Override
            public String map(String rawIp) throws Exception {
                return rawIp.trim();
            }
        };
        SingleOutputStreamOperator<String> ips =
            env.fromCollection(GetRandomIp.getRandomIp()).map(trimIp);

        // Expose the stream as view "model" with a single column "ip".
        tableenv.createTemporaryView("model", ips, $("ip"));
        tableenv.createTemporaryFunction("ip_parser", new UdfWithIpPaser());

        Table parsed = tableenv.sqlQuery("select ip, ip_parser(ip) from model");
        tableenv.toAppendStream(parsed, Row.class).print();
        env.execute();
    }
}
import com.utils.IPpaserUtil;
import org.apache.flink.table.functions.ScalarFunction;
/** Scalar function that delegates the lookup of an IP address to IPpaserUtil.getAddress. */
public class UdfWithIpPaser extends ScalarFunction {
    /**
     * @param ip the IP address to look up
     * @return whatever IPpaserUtil.getAddress returns for {@code ip}
     */
    public String eval(String ip) {
        String address = IPpaserUtil.getAddress(ip);
        return address;
    }
}
UDTF
需求:传入一个json串,将其中的eventTime字段拆分成日期和时间两个字段
数据格式:
{"userID": "user_5", "eventTime": "2019-12-01 10:02:00", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:02", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:10", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:12", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:15", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:16", "eventType": "browse", "productID": "product_5", "productPrice": 20}
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Table-function (UDTF) demo.
 *
 * Reads JSON lines from a text file, maps them to a four-column table and uses
 * the split_filed table function to split the "times" column into two columns.
 */
public class UdtfApp {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableenv = StreamTableEnvironment.create(env);

        // Picks userID, eventTime, productID and productPrice out of each JSON line.
        MapFunction<String, Tuple4<String, String, String, Integer>> parseLine =
            new MapFunction<String, Tuple4<String, String, String, Integer>>() {
                @Override
                public Tuple4<String, String, String, Integer> map(String line) throws Exception {
                    JSONObject event = JSON.parseObject(line);
                    String user = event.getString("userID");
                    String eventTime = event.getString("eventTime");
                    String product = event.getString("productID");
                    Integer amount = Integer.parseInt(event.getString("productPrice"));
                    return Tuple4.of(user, eventTime, product, amount);
                }
            };
        SingleOutputStreamOperator<Tuple4<String, String, String, Integer>> events = env
            .readTextFile("/Users/IdeaProjects/Test/Flink_Project/Flink_SQL/src/main/java/com/data/1.txt")
            .map(parseLine);

        tableenv.createTemporaryView("model", events, $("userid"), $("times"), $("productid"), $("price"));
        tableenv.createTemporaryFunction("split_filed", new UdtfWithFiledPaser());

        // Each input row is joined with the rows emitted by the table function.
        Table result = tableenv.sqlQuery("select times,days,hours from model, lateral table(split_filed(times)) as t(days,hours)");
        tableenv.toAppendStream(result, Row.class).print();
        env.execute();
    }
}
import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
public class UdtfWithFiledPaser extends TableFunction<Row> {
    /**
     * Splits {@code value} on single spaces and emits one two-field row holding
     * the first and second parts.
     *
     * <p>The {@code @DataTypeHint} annotation declares the emitted row as two
     * string fields. Input that is {@code null} or does not split into at least
     * two parts emits no row, instead of failing with an exception.
     *
     * @param value the text to split, e.g. {@code "2019-12-01 10:02:00"}
     */
    @DataTypeHint("ROW<s String,t String>")
    public void eval(String value) {
        if (value == null) {
            return;
        }
        String[] parts = value.split(" ");
        if (parts.length < 2) {
            // Nothing sensible to put in the second field; skip this input.
            return;
        }
        Row row = new Row(2);
        row.setField(0, parts[0]);
        row.setField(1, parts[1]);
        collect(row);
    }
}
UDAF
需求:实现五秒窗口内平均每人消费水平
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.lxf.utils.StringTimeToTimeStamp;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Aggregate-function (UDAF) demo: many rows in, one row out.
 * Reads JSON lines from a text file and, for every 5-second tumbling window on
 * event time, outputs the sum of prices, the number of rows and the result of
 * the avg_price aggregate function.
 */
public class UdafApp {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableenv = StreamTableEnvironment.create(env);
        env.setParallelism(1);

        // Maps each JSON line to (userID, event time in millis, event time text, productID, price).
        MapFunction<String, Tuple5<String, Long, String, String, Double>> parseLine =
            new MapFunction<String, Tuple5<String, Long, String, String, Double>>() {
                @Override
                public Tuple5<String, Long, String, String, Double> map(String line) throws Exception {
                    JSONObject event = JSON.parseObject(line);
                    String user = event.getString("userID");
                    String eventTime = event.getString("eventTime");
                    Long eventMillis = StringTimeToTimeStamp.strtime_timestamp(eventTime);
                    String product = event.getString("productID");
                    Double amount = Double.parseDouble(event.getString("productPrice"));
                    return Tuple5.of(user, eventMillis, eventTime, product, amount);
                }
            };
        SingleOutputStreamOperator<Tuple5<String, Long, String, String, Double>> events = env
            .readTextFile("/Users/IdeaProjects/Test/Flink_Project/Flink_SQL/src/main/java/com/data/1.txt")
            .map(parseLine);

        // Event time is taken from tuple field f1; the extractor is built with a bound of 0 seconds.
        SingleOutputStreamOperator<Tuple5<String, Long, String, String, Double>> timedEvents =
            events.assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Tuple5<String, Long, String, String, Double>>(Time.seconds(0)) {
                    @Override
                    public long extractTimestamp(Tuple5<String, Long, String, String, Double> element) {
                        return element.f1;
                    }
                });

        /*
         * Two things to watch when creating the temporary view:
         * 1. the columns are listed in the same order as the stream's tuple fields,
         *    one per field, with none left out;
         * 2. rowtime() is applied to the column holding the event-time value, at
         *    its matching position. The rowtime here is the event time.
         */
        tableenv.createTemporaryView("model", timedEvents,
            $("userid"), $("timestamps").rowtime(), $("times"), $("productid"), $("price"));
        tableenv.createTemporaryFunction("avg_price", new UdafWithSum());

        String sql = "select " +
            "TUMBLE_START(timestamps,interval '5' second) as win_start," +
            "TUMBLE_END(timestamps,interval '5' second) as win_end," +
            "sum(price) as moneys, " +
            "count(userid) as counts, " +
            "avg_price(price) as avgs " +
            "from model group by TUMBLE(timestamps,interval '5' second)";
        Table windowed = tableenv.sqlQuery(sql);

        // Keep only the records whose boolean flag (f0) is true.
        tableenv.toRetractStream(windowed, Row.class).filter(change -> change.f0).print();
        env.execute();
    }
}
import org.apache.flink.table.functions.AggregateFunction;
import java.util.Iterator;
/**
 * Aggregate function that returns the arithmetic mean of the accumulated values.
 * The accumulator keeps a running sum and a running count.
 */
public class UdafWithSum extends AggregateFunction<Double, UdafWithSum.AvgAccum> {
    /**
     * Produces the aggregate result from an accumulator.
     *
     * @param accum the accumulator to read
     * @return sum divided by count, or {@code null} when the count is zero
     */
    @Override
    public Double getValue(AvgAccum accum) {
        if (accum.counts == 0) {
            return null;
        }
        return accum.sumprice * 1D / accum.counts;
    }

    /**
     * Creates an empty accumulator.
     *
     * @return an accumulator with sum 0 and count 0
     */
    @Override
    public AvgAccum createAccumulator() {
        AvgAccum accum = new AvgAccum();
        accum.sumprice = 0D;
        accum.counts = 0;
        return accum;
    }

    /** Accumulator: running sum of the values and the number of values seen. */
    public static class AvgAccum {
        public Double sumprice;
        public Integer counts;
    }

    /**
     * Adds one input value to the accumulator.
     *
     * @param accum the accumulator to update
     * @param v1 the input value
     */
    public void accumulate(AvgAccum accum, Double v1) {
        accum.sumprice += v1;
        accum.counts += 1;
    }

    /**
     * Removes one previously accumulated value from the accumulator.
     * Flink requires this method for aggregations on bounded OVER windows.
     *
     * @param accum the accumulator to update
     * @param v1 the value to retract
     */
    public void retract(AvgAccum accum, Double v1) {
        accum.sumprice -= v1;
        accum.counts -= 1;
    }

    /**
     * Folds a group of accumulators into {@code accum}.
     * Flink requires this method for session-window aggregations and many
     * bounded (batch) aggregations.
     *
     * @param accum the accumulator that receives the merged state
     * @param it the accumulators to merge into {@code accum}
     */
    public void merge(AvgAccum accum, Iterable<AvgAccum> it) {
        for (AvgAccum other : it) {
            // Add the other accumulator's state; adding accum to itself would
            // double the target and ignore the merged inputs.
            accum.sumprice += other.sumprice;
            accum.counts += other.counts;
        }
    }
}
总结
三种自定义函数的实现方式在官网都有对应模板,这里不再强调对应写法。但在编写自定义函数时有几点注意事项:
1、数据类型推导
Flink 自定义函数实现了自动的类型推导提取,通过反射从函数的类及其求值方法中派生数据类型。如果这种隐式的反射提取方法不成功,则可以通过使用 @DataTypeHint 和 @FunctionHint 注解相关参数、类或方法来支持提取过程。可以看到我们在udtf中就使用了 @DataTypeHint 注解来辅助类型推导。
2、作用范围
UDF就是对字段进行转换,UDTF可以作用于一些复杂解析中,比如json、xml等。它实现的是行转列。UDAF作为聚合函数常常和窗口一起配合使用,因为在流处理里面全局聚合往往没有太大意义。
3、打包问题
在生产中使用,一般不会直接在代码里写好SQL语句。都是集成在第三方的开发平台上进行SQL逻辑编写。这时我们定义好自定义函数代码逻辑并本地验证没有问题后,需要进行打包上传至第三方平台。那么在打包的过程中需要注意不要带上flink相关依赖,第三方平台有自己的一套flink运行环境,如果直接打包运行函数会导致依赖冲突。而非flink依赖则需要全部打包上去。