Writing Data to Greenplum from a Flink Program via COPY

Background

The earlier post Flink CDC整库同步_flink cdc mysql 全量同步-CSDN博客 implemented the basic data-synchronization pipeline, but when the source side produces a large volume of real-time changes, the data can no longer reach the lake within seconds. This post switches the sink operator to Greenplum's COPY write path to optimize its implementation.

Implementation

Create a custom sink that buffers incoming rows in an in-memory List. Each time a Flink checkpoint fires, the buffered rows are written into Greenplum with the Greenplum/PostgreSQL CopyManager.
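Because the sink's flush() is driven by snapshotState(), the job must run with checkpointing enabled; the checkpoint interval then effectively controls both batch size and write latency. Below is a minimal wiring sketch, assuming an upstream DataStream<RowData> whose field order matches the target table (for example, the converted CDC stream from the earlier post). The class name CopySinkJob, the method attachCopySink, and the 10-second interval are illustrative and not part of the original code.

```java
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;

public class CopySinkJob {

    /**
     * Illustrative wiring: attach DwCopySink to an existing RowData stream whose field
     * order matches the table configured by copy_dw_schema / copy_dw_table.
     */
    public static void attachCopySink(StreamExecutionEnvironment env, DataStream<RowData> rows) {
        // DwCopySink flushes its buffer in snapshotState(), so checkpointing must be enabled;
        // the interval below (10 s) is a placeholder and controls batch size and latency.
        env.enableCheckpointing(10_000L, CheckpointingMode.EXACTLY_ONCE);

        rows.addSink(new DwCopySink()).name("greenplum-copy-sink");
    }
}
```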

```java
import util.JDBCURL;
import util.PropUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.connector.jdbc.catalog.PostgresCatalog;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.table.api.Schema;
import org.apache.flink.table.catalog.CatalogBaseTable;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.DecimalType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.postgresql.copy.CopyManager;
import org.postgresql.jdbc.PgConnection;

import java.io.ByteArrayInputStream;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.nio.charset.StandardCharsets;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.stream.Collectors;
 
@Slf4j
public class DwCopySink extends RichSinkFunction<RowData> implements CheckpointedFunction {
 
    private static final String DEFAULT_NULL_VALUE = "\002";
 
    private static final String DEFAULT_FIELD_DELIMITER = "\001";
 
    // PostgreSQL rejects strings containing \u0000: ERROR: invalid byte sequence for encoding "UTF8": 0x00
    public static final String SPACE = "\u0000";
 
    private static final String LINE_DELIMITER = "\n";
 
    protected static final String COPY_SQL_TEMPL =
            "copy %s(%s) from stdin DELIMITER '%s' NULL as '%s'";
 
    private PgConnection connection;
 
    private CopyManager copyManager;
 
    private static PostgresCatalog postgresCatalog;
 
    private static CatalogBaseTable table;
 
 
    private List<RowData> batchList = new ArrayList<>();
 
    private static DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
 
    private String sql;
 
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
 
 
        String url = PropUtils.getProperties().getProperty("dw_jdbc");
        String username = PropUtils.getProperties().getProperty("dw_user");
        String password = PropUtils.getProperties().getProperty("dw_password");
 
        connection = (PgConnection) DriverManager.getConnection(url, username, password);
 
        buildCopySql();
    }
 
    @Override
    public void invoke(RowData value, Context context) throws Exception {
        super.invoke(value, context);
        batchList.add(value);
 
    }
 
    @Override
    public void finish() throws Exception {
        super.finish();
    }
 
    @Override
    public void close() throws Exception {
        flush();
        super.close();
        if (connection != null) {
            connection.close();
        }
    }
 
    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
        // Flush the buffered data on every checkpoint to avoid duplicate writes
        flush();
    }
 
    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {
        // No initialization needed, nothing to do here
    }
 
    public static PostgresCatalog getCatalog() {
 
        if (postgresCatalog == null) {
            String url = PropUtils.getProperties().getProperty("dw_jdbc");
            String username = PropUtils.getProperties().getProperty("dw_user");
            String password = PropUtils.getProperties().getProperty("dw_password");
            final Matcher matcher1 = JDBCURL.getPattern("jdbc:postgresql://{host}[:{port}]/[{database}][\\?{params}]")
                    .matcher(url);
            if (matcher1.matches()) {
                String database = matcher1.group("database");
                String host = matcher1.group("host");
                String port = matcher1.group("port");
 
                String source_url = String.format("jdbc:postgresql://%s:%d", host, Integer.parseInt(port));
 
                postgresCatalog = new PostgresCatalog(DwCopySink.class.getClassLoader(), "postgresql",
                        database, username, password, source_url);
            }
        }
        return postgresCatalog;
    }
 
 
    /**
     * Builds the RowType of the table being synchronized from its catalog schema.
     *
     * @return the RowType describing the target table's columns
     * @throws TableNotExistException if the target table cannot be found in the catalog
     */
    public RowType getTableRowType() throws TableNotExistException {
 
        Schema schema = getTable().getUnresolvedSchema();
 
        List<LogicalType> logicalTypes = schema.getColumns().stream().map(col ->
        {
            Schema.UnresolvedPhysicalColumn column = (Schema.UnresolvedPhysicalColumn) col;
            return ((DataType) column.getDataType()).getLogicalType();
        }).collect(Collectors.toList());
 
        List<String> nameList = schema.getColumns().stream().map(col -> col.getName()).collect(Collectors.toList());
 
        return RowType.of(logicalTypes.toArray(new LogicalType[logicalTypes.size()]),
                nameList.toArray(new String[nameList.size()]));
    }
 
    public static CatalogBaseTable getTable() throws TableNotExistException {
        if (table == null) {
            String schemaName = PropUtils.getProperties().getProperty("copy_dw_schema");
            String tableName = PropUtils.getProperties().getProperty("copy_dw_table");
 
            table = getCatalog().getTable(new ObjectPath(postgresCatalog.getDefaultDatabase(),
                    schemaName + "." + tableName));
        }
        return table;
    }
 
    private String buildCopySql() throws SQLException, TableNotExistException {
 
        if(StringUtils.isNotEmpty(sql)){
            return sql;
        }
 
        String schemaName = PropUtils.getProperties().getProperty("copy_dw_schema");
        String tableName = PropUtils.getProperties().getProperty("copy_dw_table");
 
        // Look up the basic information (schema) of the target table
        Schema schema = getTable().getUnresolvedSchema();
 
        String fieldsExpression = schema.getColumns().stream().map(col -> this.quoteIdentifier(col.getName()))
                .collect(Collectors.joining(", "));
 
        String tableLocation;
        if (schemaName != null && !"".equals(schemaName.trim())) {
            tableLocation = quoteIdentifier(schemaName) + "." + quoteIdentifier(tableName);
        } else {
            tableLocation = quoteIdentifier(tableName);
        }
 
        sql = String.format(
                COPY_SQL_TEMPL, tableLocation, fieldsExpression, DEFAULT_FIELD_DELIMITER, DEFAULT_NULL_VALUE);
 
        return sql;
    }
 
    private String quoteIdentifier(String identifier) {
        return "\"" + identifier + "\"";
    }
 
    /**
     * Flushes the buffered rows into the database with a single COPY call.
     *
     * @throws SQLException if the COPY fails
     */
    private synchronized void flush() throws SQLException {
 
        if (batchList.size() == 0) {
            return;
        }
 
        long size = batchList.size();
        copyManager = connection.getCopyAPI();
        StopWatch stopWatch = new StopWatch();
        StringBuilder rowsStrBuilder = new StringBuilder(128);
        stopWatch.start();
        for (RowData row : batchList) {
            int lastIndex = row.getArity() - 1;
            StringBuilder rowStr = new StringBuilder(128);
            for (int index = 0; index < row.getArity(); index++) {
                appendColumn(row, index, rowStr, index == lastIndex);
            }
            String tempData = rowStr.toString();
            rowsStrBuilder.append(copyModeReplace(tempData)).append(LINE_DELIMITER);
        }
 
 
        StopWatch stopWatch2 = new StopWatch();

        try (ByteArrayInputStream bi =
                     new ByteArrayInputStream(rowsStrBuilder.toString().getBytes(StandardCharsets.UTF_8))) {
            stopWatch.stop();

            stopWatch2.start(); // start timing the COPY write
            copyManager.copyIn(sql, bi);
        } catch (Exception e) {
            log.error(sql);
            log.error("Failed to copy data into Greenplum:", e);
            throw new RuntimeException("Failed to copy data into Greenplum", e);
        }

        stopWatch2.stop();

        log.info(String.format("Building the byte buffer for [%d] rows took [%d] ms", size, stopWatch.getTime()));

        // guard against a 0 ms measurement before computing rows per second
        long watch2Time = Math.max(stopWatch2.getTime(), 1);

        // convert to rows written per second
        BigDecimal decimal = BigDecimal.valueOf(size)
                .divide(BigDecimal.valueOf(watch2Time), 3, RoundingMode.HALF_EVEN)
                .multiply(BigDecimal.valueOf(1000));

        log.info(String.format("Copied [%d] rows in [%d] ms, about %d rows per second",
                size, watch2Time, decimal.intValue()));
        batchList.clear();
    }
 
    private void appendColumn(
            RowData rowData, int pos, StringBuilder rowStr, boolean isLast) {

        // Null fields are written as the NULL marker declared in the COPY statement
        if (rowData.isNullAt(pos)) {
            rowStr.append(DEFAULT_NULL_VALUE);
            if (!isLast) {
                rowStr.append(DEFAULT_FIELD_DELIMITER);
            }
            return;
        }

        Schema schema;
        try {
            schema = getTable().getUnresolvedSchema();
        } catch (TableNotExistException e) {
            throw new RuntimeException(e);
        }
        Schema.UnresolvedPhysicalColumn unresolvedColumn = (Schema.UnresolvedPhysicalColumn) schema.getColumns().get(pos);

        LogicalType logicalType = ((DataType) unresolvedColumn.getDataType()).getLogicalType();

        String col;

        switch (logicalType.getTypeRoot()) {
            case CHAR:
            case VARCHAR:
                col = rowData.getString(pos).toString();
                break;
            case BOOLEAN:
                col = String.valueOf(rowData.getBoolean(pos));
                break;
            case DECIMAL:
                DecimalType decimalType = (DecimalType) logicalType;
                col = rowData.getDecimal(pos, decimalType.getPrecision(), decimalType.getScale()).toString();
                break;
            case TINYINT:
                col = String.valueOf(rowData.getByte(pos));
                break;
            case SMALLINT:
                col = String.valueOf(rowData.getShort(pos));
                break;
            case INTEGER:
                col = String.valueOf(rowData.getInt(pos));
                break;
            case BIGINT:
                col = String.valueOf(rowData.getLong(pos));
                break;
            case FLOAT:
                col = String.valueOf(rowData.getFloat(pos));
                break;
            case DOUBLE:
                col = String.valueOf(rowData.getDouble(pos));
                break;
            case DATE:
                // DATE values are stored in RowData as the number of days since the epoch
                col = LocalDate.ofEpochDay(rowData.getInt(pos)).toString();
                break;
            case TIME_WITHOUT_TIME_ZONE:
                // TIME values are stored as the number of milliseconds of the day
                col = LocalTime.ofNanoOfDay(rowData.getInt(pos) * 1_000_000L).toString();
                break;
            case TIMESTAMP_WITHOUT_TIME_ZONE:
            case TIMESTAMP_WITH_TIME_ZONE:
            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
                LocalDateTime dateTime = rowData.getTimestamp(pos, 6).toLocalDateTime();
                col = dateTime.format(formatter);
                break;
            default:
                throw new RuntimeException(String.format("Unsupported data type %s %s",
                        unresolvedColumn.getName(), unresolvedColumn.getDataType().toString()));
        }

        rowStr.append(col);
        if (!isLast) {
            rowStr.append(DEFAULT_FIELD_DELIMITER);
        }
    }
 
    private String copyModeReplace(String rowStr) {
        if (rowStr.contains("\\")) {
            rowStr = rowStr.replaceAll("\\\\", "\\\\\\\\");
        }
        if (rowStr.contains("\r")) {
            rowStr = rowStr.replaceAll("\r", "\\\\r");
        }
 
        if (rowStr.contains("\n")) {
            rowStr = rowStr.replaceAll("\n", "\\\\n");
        }
 
        // PostgreSQL rejects strings containing \u0000: ERROR: invalid byte sequence for encoding "UTF8": 0x00
        if (rowStr.contains(SPACE)) {
            rowStr = rowStr.replaceAll(SPACE, "");
        }
        return rowStr;
    }
 
    public PostgresCatalog getPostgresCatalog() {
        return postgresCatalog;
    }
}
```
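The sink reads its connection and target-table settings through PropUtils. The keys below are the ones referenced in the code (dw_jdbc, dw_user, dw_password, copy_dw_schema, copy_dw_table); the values shown and the properties file itself are placeholders for illustration, since the original post does not show the configuration file.

```properties
# Greenplum / PostgreSQL connection used by DwCopySink (placeholder values)
dw_jdbc=jdbc:postgresql://gp-master:5432/dw
dw_user=gpadmin
dw_password=changeme

# Target table written by the COPY statement
copy_dw_schema=ods
copy_dw_table=demo_table
```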
Results
