Flink学习12-FlinkTab&SQL（二）

最新推荐文章于 2024-10-15 22:35:55 发布

40926__

最新推荐文章于 2024-10-15 22:35:55 发布

阅读量234

点赞数

文章标签： flink 学习 sql

本文链接：https://blog.csdn.net/qq_40342691/article/details/133069571

版权

FlinkSql自定义函数使用

默认分为三种：
UDF:也叫标量函数，输入一行返回一行
UDTF:也叫表值函数，输入一行返回多行
UDAF:比较特殊，官方分为聚合函数（AggregateFunction）和表聚合函数（TableAggregateFunction）两种。前一种实现输入多行返回一行，后一种输入多行返回一张表。
接下来通过三个案例来实现自定义函数

UDF

需求：随机输入10个国内IP，解析成“省份-城市”


import java.util.ArrayList;
import java.util.Random;

public class GetRandomIp {

    /** Number of addresses returned by one call to {@link #getRandomIp()}. */
    private static final int IP_COUNT = 10;

    /**
     * Inclusive {first, last} bounds of the address ranges to draw from. Each address is
     * stored as a signed 32-bit integer, so addresses above 127.255.255.255 appear negative.
     */
    private static final int[][] RANGES = {
            {607649792, 608174079},     // 36.56.0.0-36.63.255.255
            {1038614528, 1039007743},   // 61.232.0.0-61.237.255.255
            {1783627776, 1784676351},   // 106.80.0.0-106.95.255.255
            {2035023872, 2035154943},   // 121.76.0.0-121.77.255.255
            {2078801920, 2079064063},   // 123.232.0.0-123.235.255.255
            {-1950089216, -1948778497}, // 139.196.0.0-139.215.255.255
            {-1425539072, -1425014785}, // 171.8.0.0-171.15.255.255
            {-1236271104, -1235419137}, // 182.80.0.0-182.92.255.255
            {-770113536, -768606209},   // 210.25.0.0-210.47.255.255
            {-569376768, -564133889},   // 222.16.0.0-222.95.255.255
    };

    /**
     * Generates 10 random IPv4 addresses, each drawn from a randomly chosen entry of the
     * fixed range table (described by the original author as domestic/Chinese ranges).
     *
     * @return a new list of 10 dotted-decimal address strings
     */
    public static ArrayList<String> getRandomIp() {
        // One generator for the whole call; the range is re-drawn for every address so the
        // result is not confined to a single range.
        Random random = new Random();
        ArrayList<String> ips = new ArrayList<>(IP_COUNT);
        for (int i = 0; i < IP_COUNT; i++) {
            int[] bounds = RANGES[random.nextInt(RANGES.length)];
            // +1 because nextInt's bound is exclusive while the table's upper bound is inclusive
            int span = bounds[1] - bounds[0] + 1;
            ips.add(num2ip(bounds[0] + random.nextInt(span)));
        }
        return ips;
    }

    /**
     * Converts a 32-bit integer into dotted-decimal notation, most significant byte first.
     *
     * @param ip the address as a signed 32-bit integer
     * @return the address as "a.b.c.d"
     */
    private static String num2ip(int ip) {
        return ((ip >> 24) & 0xff) + "."
                + ((ip >> 16) & 0xff) + "."
                + ((ip >> 8) & 0xff) + "."
                + (ip & 0xff);
    }

}


import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URL;

public class IPpaserUtil implements Serializable {

    /**
     * Looks up the location of an IP address by calling the pconline whois HTTP endpoint.
     *
     * @param ip the IP address; it is appended unchanged to the request URL
     * @return {@code "<pro>-<city>"} built from the {@code pro} and {@code city} fields of the
     *         JSON response; an empty string when the HTTP status is not 200; {@code null}
     *         when any exception occurs (the stack trace is printed)
     */
    public static String getAddress(String ip) {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL("http://whois.pconline.com.cn/ipJson.jsp?ip=" + ip + "&json=true");
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setRequestMethod("GET");
            conn.setUseCaches(false);
            conn.setReadTimeout(6000);
            conn.setConnectTimeout(6000);
            conn.setInstanceFollowRedirects(false);

            if (conn.getResponseCode() != 200) {
                return "";
            }

            StringBuilder sb = new StringBuilder();
            // try-with-resources closes the reader (and the underlying stream) even if reading fails;
            // the response body is decoded as GBK
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "GBK"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }

            JSONObject jsonObject = JSON.parseObject(sb.toString());
            return jsonObject.getString("pro") + "-" + jsonObject.getString("city");
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // release the connection on every path, including errors
            if (conn != null) {
                conn.disconnect();
            }
        }
    }
}


import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Utility class that converts a date-time string into an epoch-millisecond timestamp.
 */
public class StringTimeToTimeStamp {

    /**
     * Parses a string in {@code yyyy-MM-dd HH:mm:ss} form with a freshly created
     * {@link SimpleDateFormat} and returns the resulting {@code Date.getTime()} value.
     *
     * @param times the date-time text to parse
     * @return the timestamp in milliseconds, or {@code null} when parsing throws
     *         (the stack trace is printed)
     */
    public static Long strtime_timestamp(String times) {
        Long millis = null;
        try {
            SimpleDateFormat parser = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            millis = parser.parse(times).getTime();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return millis;
    }
}

import com.utils.GetRandomIp;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

import static org.apache.flink.table.api.Expressions.$;

/**
 * UDF (scalar function) demo: one input row yields one output value.
 * Runs the generated IP addresses through the {@code ip_parser} function and prints
 * each {@code (ip, ip_parser(ip))} row.
 */
public class UdfApp {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableenv = StreamTableEnvironment.create(env);

        // Strips surrounding whitespace from every address before it becomes a table column.
        MapFunction<String, String> trimmer = new MapFunction<String, String>() {
            @Override
            public String map(String value) throws Exception {
                return value.toString().trim();
            }
        };
        SingleOutputStreamOperator<String> source = env
                .fromCollection(GetRandomIp.getRandomIp())
                .map(trimmer);

        // Expose the stream as view "model" with a single column "ip" and register the UDF.
        tableenv.createTemporaryView("model",source,$("ip"));
        tableenv.createTemporaryFunction("ip_parser",new UdfWithIpPaser());

        Table table = tableenv.sqlQuery("select ip, ip_parser(ip) from model");
        tableenv
                .toAppendStream(table, Row.class)
                .print();

        env.execute();
    }
}

import com.utils.IPpaserUtil;
import org.apache.flink.table.functions.ScalarFunction;

/**
 * Scalar function that maps an IP address to the string produced by
 * {@code IPpaserUtil.getAddress}.
 */
public class UdfWithIpPaser extends ScalarFunction {

    /**
     * @param ip the IP address to look up
     * @return whatever {@code IPpaserUtil.getAddress(ip)} returns for that address
     */
    public String eval(String ip) {
        final String address = IPpaserUtil.getAddress(ip);
        return address;
    }
}

UDTF

需求：传入一个json串，将json串解析成两个字段
数据格式：

{"userID": "user_5", "eventTime": "2019-12-01 10:02:00", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:02", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:10", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:12", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_5", "eventTime": "2019-12-01 10:02:06", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:15", "eventType": "browse", "productID": "product_5", "productPrice": 20}
{"userID": "user_4", "eventTime": "2019-12-01 10:02:16", "eventType": "browse", "productID": "product_5", "productPrice": 20}


import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

import static org.apache.flink.table.api.Expressions.$;

/**
 * UDTF (table function) demo.
 *
 * Reads JSON lines from a text file, exposes them as the view {@code model}, and uses the
 * {@code split_filed} table function to split the {@code times} column into {@code days}
 * and {@code hours}.
 */
public class UdtfApp {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableenv = StreamTableEnvironment.create(env);

        SingleOutputStreamOperator<Tuple4<String, String, String, Integer>> source = env
                .readTextFile("/Users/IdeaProjects/Test/Flink_Project/Flink_SQL/src/main/java/com/data/1.txt")
                .map(new EventLineParser());

        // Column names are bound to the tuple fields by position.
        tableenv.createTemporaryView("model",source,$("userid"),$("times"),$("productid"),$("price"));
        tableenv.createTemporaryFunction("split_filed",new UdtfWithFiledPaser());

        Table table = tableenv.sqlQuery("select times,days,hours from model, lateral table(split_filed(times)) as t(days,hours)");
        tableenv
                .toAppendStream(table, Row.class)
                .print();

        env.execute();

    }

    /**
     * Parses one JSON line into (userID, eventTime, productID, productPrice).
     */
    public static class EventLineParser implements MapFunction<String, Tuple4<String, String, String, Integer>> {
        @Override
        public Tuple4<String, String, String, Integer> map(String value) throws Exception {
            JSONObject event = JSON.parseObject(value);
            return Tuple4.of(
                    event.getString("userID"),
                    event.getString("eventTime"),
                    event.getString("productID"),
                    Integer.valueOf(Integer.parseInt(event.getString("productPrice"))));
        }
    }
}


import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

/**
 * Table function that splits a space-separated string into a two-field row.
 */
public class UdtfWithFiledPaser extends TableFunction<Row> {

    /**
     * Emits one row holding the first two space-separated parts of {@code value}.
     * The {@link DataTypeHint} declares the emitted row as two string fields.
     *
     * <p>A {@code null} input emits no row. An input without a space emits the whole value
     * in the first field and {@code null} in the second, instead of failing the job with an
     * {@code ArrayIndexOutOfBoundsException}.
     *
     * @param value the text to split, e.g. {@code "2019-12-01 10:02:00"}
     */
    @DataTypeHint("ROW<s String,t String>")
    public void eval(String value){
        if (value == null) {
            return;
        }
        String[] split = value.split(" ");
        Row row = new Row(2);
        // split(" ") on an empty string still yields one (empty) element, so index 0 is safe
        row.setField(0, split.length > 0 ? split[0] : null);
        row.setField(1, split.length > 1 ? split[1] : null);
        collect(row);
    }
}

UDAF
需求：实现五秒窗口内平均每人消费水平


import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.lxf.utils.StringTimeToTimeStamp;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

import static org.apache.flink.table.api.Expressions.$;

/**
 * UDAF demo: many input rows are aggregated into one output row.
 *
 * Reads JSON lines from a text file, assigns event-time timestamps from {@code eventTime},
 * and for every 5-second tumbling window prints the window start/end, {@code sum(price)},
 * {@code count(userid)} and the result of the user-defined {@code avg_price(price)}.
 *
 * Background from the original notes: an aggregate function extends AggregateFunction and
 * reduces a table to a single scalar value; a table aggregate function extends
 * TableAggregateFunction and reduces a table to another table.
 */
public class UdafApp {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableenv = StreamTableEnvironment.create(env);
        env.setParallelism(1);

        SingleOutputStreamOperator<Tuple5<String, Long, String, String, Double>> source
                = env.readTextFile("/Users/IdeaProjects/Test/Flink_Project/Flink_SQL/src/main/java/com/data/1.txt")
                .map(new MapFunction<String, Tuple5<String, Long, String, String, Double>>() {
                    @Override
                    public Tuple5<String, Long, String, String, Double> map(String value) throws Exception {
                        JSONObject json = JSON.parseObject(value);
                        String userid = json.getString("userID");
                        String times = json.getString("eventTime");
                        // strtime_timestamp is static: call it on the class, not on a throwaway instance
                        Long timestamps = StringTimeToTimeStamp.strtime_timestamp(times);
                        if (timestamps == null) {
                            // The helper returns null for unparseable input. Fail here with the
                            // offending record instead of an unboxing NullPointerException later
                            // in extractTimestamp.
                            throw new IllegalArgumentException(
                                    "Unparseable eventTime '" + times + "' in record: " + value);
                        }
                        String productid = json.getString("productID");
                        Double price = Double.parseDouble(json.getString("productPrice"));
                        return Tuple5.of(userid, timestamps, times, productid, price);
                    }
                });
        // Event time is taken from tuple field f1; the allowed out-of-orderness is 0 seconds.
        SingleOutputStreamOperator<Tuple5<String, Long, String, String, Double>> stream
                = source.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Tuple5<String, Long, String, String, Double>>(Time.seconds(0)) {
            @Override
            public long extractTimestamp(Tuple5<String, Long, String, String, Double> value) {
                return value.f1;
            }
        });
        /*
         * Two things to watch when creating the temporary view (author's notes):
         * 1. The column expressions must line up one-to-one with the stream's fields;
         *    no field may be skipped in the middle.
         * 2. rowtime() can be applied to a time/timestamp field of the stream, again at the
         *    matching position. The rowtime here is the event time.
         */
        tableenv.createTemporaryView("model",stream,$("userid"),$("timestamps").rowtime(),$("times"),$("productid"),$("price"));
        tableenv.createTemporaryFunction("avg_price",new UdafWithSum());
        String sql = "select " +
                "TUMBLE_START(timestamps,interval '5' second) as win_start," +
                "TUMBLE_END(timestamps,interval '5' second) as win_end," +
                "sum(price) as moneys, " +
                "count(userid) as counts, " +
                "avg_price(price) as avgs " +
                "from model group by TUMBLE(timestamps,interval '5' second)";
        Table table = tableenv.sqlQuery(sql);
        // Keep only the elements whose boolean flag (f0) is true before printing.
        tableenv.toRetractStream(table, Row.class).filter(x->x.f0).print();

        env.execute();
    }
}


import org.apache.flink.table.functions.AggregateFunction;

import java.util.Iterator;

/**
 * Aggregate function that returns the arithmetic mean of the accumulated {@code Double} values.
 */
public class UdafWithSum extends AggregateFunction<Double, UdafWithSum.AvgAccum> {

    /** Accumulator holding the running sum and the number of values seen. */
    public static class AvgAccum{
        public Double sumprice;
        public Integer counts;
    }

    /**
     * Creates an empty accumulator (sum 0, count 0).
     */
    @Override
    public AvgAccum createAccumulator() {
        AvgAccum accum = new AvgAccum();
        accum.sumprice = 0D;
        accum.counts = 0;
        return accum;
    }

    /**
     * Produces the aggregation result.
     *
     * @param accum the accumulator to read
     * @return sum divided by count, or {@code null} when no value has been accumulated
     */
    @Override
    public Double getValue(AvgAccum accum) {
        if (accum.counts == 0) {
            return null;
        }
        return accum.sumprice / accum.counts;
    }

    /**
     * Adds one input value to the accumulator.
     *
     * @param accum the accumulator holding the intermediate result
     * @param v1 the input value
     */
    public void accumulate(AvgAccum accum,Double v1){
        accum.sumprice += v1;
        accum.counts += 1;
    }

    /**
     * Removes one previously accumulated value (the original note says this is required
     * for bounded OVER windows).
     *
     * @param accum the accumulator holding the intermediate result
     * @param v1 the value to take back out
     */
    public void retract(AvgAccum accum,Double v1){
        accum.sumprice -= v1;
        accum.counts -= 1;
    }

    /**
     * Folds other partial accumulators into {@code accum}.
     *
     * @param accum the accumulator that receives the merged result
     * @param it the accumulators to merge in
     */
    public void merge(AvgAccum accum,Iterable<AvgAccum> it){
        for (AvgAccum other : it) {
            // add the OTHER accumulator's totals; adding accum to itself would double it
            // on every iteration and ignore the merged data
            accum.sumprice += other.sumprice;
            accum.counts += other.counts;
        }
    }
}

总结

三种自定义函数实现方式官网有对应模版，这里不再赘述对应写法。但在编写自定义函数时有几点注意事项：
1、数据类型推导
Flink 自定义函数实现了自动的类型推导提取，通过反射从函数的类及其求值方法中派生数据类型。如果这种隐式的反射提取方法不成功，则可以通过使用 @DataTypeHint 和 @FunctionHint 注解相关参数、类或方法来支持提取过程。可以看到我们在 UDTF 中返回的是 Row 类型，因此使用了 @DataTypeHint 注解来声明输出字段的类型。
2、作用范围
UDF就是对字段进行转换，UDTF可以作用于一些复杂解析中，比如json、xml等。它实现的是行转列。UDAF作为聚合函数常常和窗口一起配合使用，因为在流处理里面全局聚合往往没有太大意义。
3、打包问题
在生产中使用，一般不会直接在代码里写好SQL语句。都是集成在第三方的开发平台上进行SQL逻辑编写。这时我们定义好自定义函数代码逻辑并本地验证没有问题后，需要进行打包上传至第三方平台。那么在打包的过程中需要注意不要带上flink相关依赖，第三方平台有自己的一套flink运行环境，如果直接打包运行函数会导致依赖冲突。而非flink依赖则需要全部打包上去。