Set the timeout trigger's time characteristic to ProcessingTime
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
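Note that setStreamTimeCharacteristic is deprecated since Flink 1.12, where the time characteristic is chosen per window assigner; the snippets below target the older API. For context, here is a minimal, self-contained sketch of the job setup these snippets assume. The source values are illustrative stand-ins: the real job reads a CDC stream of Tuple3<Boolean, String, Row>, where f0 marks upsert vs. delete, f1 is the table name, and f2 is the row data.

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.types.Row;

public class JobSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
        // Stand-in for the real CDC source: f0 = upsert flag, f1 = table name, f2 = row data
        DataStream<Tuple3<Boolean, String, Row>> dataStream =
                env.fromElements(Tuple3.of(true, "demo_table", Row.of(1, "a")));
    }
}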
Main pipeline code
dataStream
        .keyBy(r -> r.f1) // key by table name so records of the same table keep their relative order
        .timeWindow(Time.of(10, TimeUnit.SECONDS)) // 10-second timeout
        .trigger(new CountTriggerWithTimeout<>(500, TimeCharacteristic.ProcessingTime)) // flush every 500 records
        .process(new CdcMyProcessWindowFunction()) // collect the records of one window into a List
        .addSink(new GreenPlumSink()); // write to GreenPlum
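Taken together, each key (that is, each table) flushes a batch downstream either when 500 records have accumulated or when the 10-second window expires, whichever comes first. A GreenPlum batch therefore never exceeds 500 statements and never lags more than about 10 seconds behind the source.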
Custom trigger with timeout
package com.ysservice.dataStreamApi.udfs;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingState;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
 * Count window trigger with a timeout.
 */
public class CountTriggerWithTimeout<T> extends Trigger<T, TimeWindow> {
    /**
     * Maximum number of elements per window
     */
    private final int maxCount;
    /**
     * Event time / processing time
     */
    private final TimeCharacteristic timeType;
    /**
     * State descriptor for the counter that tracks the window's current element count
     */
    private final ReducingStateDescriptor<Long> countStateDescriptor =
            new ReducingStateDescriptor<>("counter", new Sum(), LongSerializer.INSTANCE);
    public CountTriggerWithTimeout(int maxCount, TimeCharacteristic timeType) {
        this.maxCount = maxCount;
        this.timeType = timeType;
    }

    private TriggerResult fireAndPurge(TimeWindow window, TriggerContext ctx) throws Exception {
        clear(window, ctx);
        return TriggerResult.FIRE_AND_PURGE;
    }
    @Override
    public TriggerResult onElement(T element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
        ReducingState<Long> countState = ctx.getPartitionedState(countStateDescriptor);
        countState.add(1L);
        // Register a timer at the end of the window so the timeout fires even
        // when fewer than maxCount elements arrive. (Duplicate registrations at
        // the same timestamp are deduplicated by Flink.)
        if (timeType == TimeCharacteristic.ProcessingTime) {
            ctx.registerProcessingTimeTimer(window.maxTimestamp());
        } else {
            ctx.registerEventTimeTimer(window.maxTimestamp());
        }
        if (countState.get() >= maxCount) {
            return fireAndPurge(window, ctx);
        }
        return TriggerResult.CONTINUE;
    }
    @Override
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        if (timeType != TimeCharacteristic.ProcessingTime) {
            return TriggerResult.CONTINUE;
        }
        // The timer registered in onElement fires at window.maxTimestamp():
        // the timeout has elapsed, so flush whatever the window holds.
        if (time >= window.maxTimestamp()) {
            return fireAndPurge(window, ctx);
        }
        return TriggerResult.CONTINUE;
    }
    @Override
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        if (timeType != TimeCharacteristic.EventTime) {
            return TriggerResult.CONTINUE;
        }
        if (time >= window.maxTimestamp()) {
            return fireAndPurge(window, ctx);
        }
        return TriggerResult.CONTINUE;
    }
    @Override
    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
        // Reset the counter; called on every fireAndPurge and at window cleanup
        ctx.getPartitionedState(countStateDescriptor).clear();
    }
    /**
     * Reduce function that sums the counter state
     */
    class Sum implements ReduceFunction<Long> {
        @Override
        public Long reduce(Long value1, Long value2) throws Exception {
            return value1 + value2;
        }
    }
}
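To see the trigger in isolation, here is a minimal, self-contained demo sketch; the element values and the 3-element/5-second thresholds are illustrative, not the values from the original job. Each key flushes a concatenation of its elements once 3 records arrive or when the 5-second window times out, whichever comes first.

import com.ysservice.dataStreamApi.udfs.CountTriggerWithTimeout;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;

public class TriggerDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
        env.fromElements("a", "a", "a", "a", "b")
                .keyBy(s -> s)
                .timeWindow(Time.seconds(5))
                .trigger(new CountTriggerWithTimeout<>(3, TimeCharacteristic.ProcessingTime))
                .reduce((x, y) -> x + y) // concatenates whatever one flush contains
                // "aaa" flushes on the count; with a long-running source the rest
                // would flush on the timeout (a bounded demo source may finish first)
                .print();
        env.execute("CountTriggerWithTimeout demo");
    }
}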
Collecting the records of one time window into a List
package com.ysservice.dataStreamApi.udfs;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.shaded.guava18.com.google.common.collect.Lists;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.List;
/**
 * @Description: Collects the records of one time window into a List
 * @author: WuBo
 * @date:2021/11/12 17:21
 */
public class CdcMyProcessWindowFunction extends ProcessWindowFunction<Tuple3<Boolean, String, Row>, List<Tuple3<Boolean, String, Row>>, String, TimeWindow> {
    @Override
    public void process(String s, Context context, Iterable<Tuple3<Boolean, String, Row>> elements, Collector<List<Tuple3<Boolean, String, Row>>> out) throws Exception {
        ArrayList<Tuple3<Boolean, String, Row>> sqlArr = Lists.newArrayList(elements);
        if (!sqlArr.isEmpty()) {
            // Do not clear the list after emitting it; downstream operators may
            // still hold a reference to the same object.
            out.collect(sqlArr);
        }
    }
}
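Because the trigger fires at 500 records or at the 10-second timeout, each List emitted here holds at most 500 elements per table, which keeps the sink's JDBC batches bounded. Buffering a whole window in a ProcessWindowFunction is only safe because the trigger caps the window's contents; with an unbounded window this would accumulate state without limit.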
Custom GreenPlum sink that writes to GreenPlum in batches
package com.ysservice.dataStreamApi.sink;
import com.ysservice.dataStreamApi.utils.GreenplumUtil;
import com.ysservice.dataStreamApi.utils.RegexUtils;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.types.Row;
import java.lang.reflect.Field;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import java.util.Set;
/**
* @Description: GreenPlum Sink
* @author: WuBo
* @date:2021/11/8 14:09
*/
public class GreenPlumSink extends RichSinkFunction<List<Tuple3<Boolean, String, Row>>> {
    Connection connection;
    Statement statement;

    // Called once when the task starts; obtains the GreenPlum connection
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        connection = GreenplumUtil.getConnection();
        statement = connection.createStatement();
    }
    // Called once per incoming record, i.e. once per window batch
    @Override
    public void invoke(List<Tuple3<Boolean, String, Row>> value, SinkFunction.Context context) throws Exception {
        if (!value.isEmpty()) {
            for (Tuple3<Boolean, String, Row> tuple3 : value) {
                if (tuple3.f0) {
                    String insertSql = insertSql(tuple3.f2.getFieldNames(true), tuple3);
                    statement.addBatch(insertSql);
                } else {
                    String deleteSql = deleteSql(tuple3.f2.getFieldNames(true), tuple3);
                    statement.addBatch(deleteSql);
                }
            }
            try {
                statement.executeBatch();
            } catch (SQLException throwables) {
                statement.clearBatch(); // drop the failed batch, then replay it row by row
                executeErrorSql(value);
            }
        }
        value.clear();
    }
    // Replays a failed batch one statement at a time; statements that still fail
    // are recorded in cdc_log.cdc_error_sql instead of failing the job
    public void executeErrorSql(List<Tuple3<Boolean, String, Row>> errorTuple3List) throws Exception {
        if (!errorTuple3List.isEmpty()) {
            for (Tuple3<Boolean, String, Row> tuple3 : errorTuple3List) {
                if (tuple3.f0) {
                    // Upsert: delete any existing row by primary key, then insert
                    String deleteSql = deleteSql(tuple3.f2.getFieldNames(true), tuple3);
                    try {
                        statement.execute(deleteSql); // if the delete fails, log the SQL to the error table
                    } catch (SQLException e) {
                        // double the quotes so the logged SQL itself stays valid
                        statement.execute("insert into cdc_log.cdc_error_sql(error_sql) values ('" + deleteSql.replace("'", "''") + "')");
                    }
                    String insertSql = insertSql(tuple3.f2.getFieldNames(true), tuple3);
                    try {
                        statement.execute(insertSql); // if the insert fails, log the SQL to the error table
                    } catch (SQLException e) {
                        statement.execute("insert into cdc_log.cdc_error_sql(error_sql) values ('" + insertSql.replace("'", "''") + "')");
                    }
                } else {
                    String deleteSql = deleteSql(tuple3.f2.getFieldNames(true), tuple3);
                    try {
                        statement.execute(deleteSql); // if the delete fails, log the SQL to the error table
                    } catch (SQLException e) {
                        statement.execute("insert into cdc_log.cdc_error_sql(error_sql) values ('" + deleteSql.replace("'", "''") + "')");
                    }
                }
            }
        }
        errorTuple3List.clear();
    }
    // Builds one "(v1, v2, ...)," values tuple; a helper for multi-row inserts (not referenced above)
    public String insertValue(Set<String> names, Tuple3<Boolean, String, Row> value) {
        String values = ""; // accumulates the values
        for (String name : names) { // iterate over the fields and splice each field's value into the tuple
            Object data = value.f2.getField(name); // value of this field
            if (data != null) {
                if (RegexUtils.numberRegex(data.toString())) { // numeric values need no quoting
                    data = data.toString();
                } else {
                    // a bare single quote would break the SQL; doubling it escapes it
                    data = "'" + RegexUtils.timeStampRegex(data.toString()).replace("'", "''") + "'";
                }
            } else {
                data = "null";
            }
            values += data + ",";
        }
        // assemble the values tuple
        return "(" + values.substring(0, (values.length() - 1)) + "),";
    }
    // Builds an INSERT statement for one row
    public String insertSql(Set<String> names, Tuple3<Boolean, String, Row> value) {
        String datas = "";  // accumulates the values
        String fields = ""; // accumulates the column names
        for (String name : names) { // iterate over the fields and splice names and values into a complete statement
            Object data = value.f2.getField(name); // value of this field
            if (data != null) {
                if (RegexUtils.numberRegex(data.toString())) { // numeric values need no quoting
                    data = data.toString();
                } else {
                    // a bare single quote would break the SQL; doubling it escapes it
                    data = "'" + RegexUtils.timeStampRegex(data.toString()).replace("'", "''") + "'";
                }
                datas += data + ",";
                fields += name + ",";
            }
        }
        // assemble the INSERT statement
        String sql = "insert into " + value.f1 + "(" + fields.substring(0, (fields.length() - 1)) + ") values(" + datas.substring(0, (datas.length() - 1)) + ");";
        return sql;
    }
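    // Example: for table "demo_table" and a row {id=1, name=O'Brien}, insertSql
    // returns: insert into demo_table(id,name) values(1,'O''Brien');
    // (null fields are skipped entirely rather than written as NULL)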
    // Builds a DELETE statement for one row
    public String deleteSql(Set<String> names, Tuple3<Boolean, String, Row> value) throws Exception {
        // Look up the table's primary-key columns via reflection and delete by primary key
        Class<?> sourceClass = Class.forName("com.ysservice.dataStreamApi.source." + value.f1);
        Field primaryKey = sourceClass.getDeclaredField("primaryKey");
        String keys = (String) primaryKey.get(sourceClass);
        String[] keyArr = keys.split(",");
        String wheres = ""; // accumulates the WHERE clause
        for (String name : keyArr) {
            Object data = value.f2.getField(name);
            if (data != null) {
                if (RegexUtils.numberRegex(data.toString())) {
                    data = data.toString();
                } else {
                    data = "'" + RegexUtils.timeStampRegex(data.toString()).replace("'", "''") + "'";
                }
                wheres += name + "=" + data + " and ";
            }
        }
        // assemble the DELETE statement (the substring strips the trailing " and ")
        String sql = "delete from " + value.f1 + " where " + wheres.substring(0, (wheres.length() - 5)) + ";";
        return sql;
    }
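    // Each table is expected to have a matching class in com.ysservice.dataStreamApi.source
    // exposing its key columns, e.g. (illustrative): public static String primaryKey = "id";
    // Example result for demo_table with primaryKey "id": delete from demo_table where id=1;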
    // Called once when the task stops; releases the JDBC resources
    @Override
    public void close() throws Exception {
        super.close();
        if (statement != null) {
            statement.close();
        }
        if (connection != null) {
            connection.close();
        }
    }
}
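The sink depends on two utility classes that the post does not show. Below is a minimal sketch of what they might look like, assuming GreenPlum is reached over its PostgreSQL-compatible JDBC driver; the URL, credentials, and regexes are placeholders inferred from the call sites, not the author's code.

package com.ysservice.dataStreamApi.utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class GreenplumUtil {
    // Placeholder connection settings; GreenPlum speaks the PostgreSQL wire protocol
    private static final String URL = "jdbc:postgresql://gp-host:5432/mydb";
    private static final String USER = "gpuser";
    private static final String PASSWORD = "secret";

    public static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(URL, USER, PASSWORD);
    }
}

package com.ysservice.dataStreamApi.utils;

public class RegexUtils {
    // True if the value can be written unquoted (integer or decimal)
    public static boolean numberRegex(String s) {
        return s.matches("-?\\d+(\\.\\d+)?");
    }

    // Guessed from the call sites: normalizes ISO-style timestamps
    // ("2021-11-08T14:09:00" -> "2021-11-08 14:09:00"), leaves other strings unchanged
    public static String timeStampRegex(String s) {
        return s.replace("T", " ");
    }
}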