Hudi Multi-Stream Merge (Partial Update) Usage Guide

Preface:

         1. The streaming JAR job and the query side must both use Hudi version 0.12.0-3-tencent.

         2. Kafka is currently the only supported source.

         3. The Hudi partition timestamp field currently supports only millisecond-precision epoch timestamps (see the conversion sketch right after this list).
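For note 3, the value written to the partition field must be an epoch timestamp in milliseconds. Below is a minimal Flink SQL sketch of the conversion, assuming an illustrative table `source_table` with a TIMESTAMP(3) column `completionTime` and a hypothetical second-level epoch column `ts_seconds`; the same pattern appears in the full example further down.

-- completionTime is TIMESTAMP(3): convert to epoch seconds via UNIX_TIMESTAMP, then scale to milliseconds
SELECT UNIX_TIMESTAMP(CAST(completionTime AS STRING)) * 1000 AS record_time_deal
FROM source_table;

-- ts_seconds is already a second-level epoch (BIGINT): just scale it up
SELECT ts_seconds * 1000 AS record_time_deal
FROM source_table;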

Complete code example:

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.StatementSet;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.hudi.common.model.PartialUpdateAvroPayload;
import org.apache.hudi.table.marker.SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FlinkPartialUpdateMOR_ka_2 {

    private static final Logger LOG = LoggerFactory.getLogger(FlinkPartialUpdateMOR_ka_2.class);

    private static final String sourceTable1 = "test_kafka_deal";
    private static final String sourceTable2 = "test_kafka_deal_detail";
    private static final String sinkAliasTable1 = "sink_1";
    private static final String sinkAliasTable2 = "sink_2";

    private static final String dbName = "csig_billing_rt_test";
    private static final String targetTable = "test_dwd_order_deal_ri_91";
    private static final String basePath = "hdfs://qy-pcg-8-v3/stage/interface/SNG/g_sng_cloudp_qc_billing_settle/tdw/warehouse/" + dbName + ".db/" + targetTable;
    private static final String metastoreUrl = "thrift://ss-qe-oms.tencent-distribute.com:8096";

    private FlinkPartialUpdateMOR_ka_2() {
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        env.setParallelism(1);
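        // Hudi's Flink writer commits data when checkpoints complete, so checkpointing must be enabled.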
        env.enableCheckpointing(1000, CheckpointingMode.EXACTLY_ONCE);

        Configuration configuration = tableEnv.getConfig().getConfiguration();
        configuration.setString("table.dynamic-table-options.enabled", "true");

        LOG.info("sourceTableDDL1 ddl: {}", sourceTableDDL1());
        tableEnv.executeSql(sourceTableDDL1());

        LOG.info("sourceTableDDL2 ddl: {}", sourceTableDDL2());
        tableEnv.executeSql(sourceTableDDL2());
        LOG.info("sinkTableDDL1 ddl: {}", sinkTableDDL1());
        tableEnv.executeSql(sinkTableDDL1());

        LOG.info("sinkTableDDL2 ddl: {}", sinkTableDDL2());
        tableEnv.executeSql(sinkTableDDL2());

        StatementSet statementSet = tableEnv.createStatementSet();
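        // Both INSERTs run in one StatementSet (a single Flink job). Each statement writes only a
        // subset of the target table's columns; PartialUpdateAvroPayload merges them on the primary key.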
        statementSet.addInsertSql(String.format("insert into %s(dealName, dealId, record_time_deal)\n" +
                        " select dealName,dealId,UNIX_TIMESTAMP(CAST(completionTime AS STRING)) *1000 as record_time_deal  from %s \n",
                sinkAliasTable1, sourceTable1));

        statementSet.addInsertSql(String.format("insert into %s(dealName, shard, record_time_deal_detail)\n" +
                        " select dealName,shard,UNIX_TIMESTAMP(CAST(completionTime AS STRING)) *1000 as record_time_deal_detail  from %s",
                sinkAliasTable2, sourceTable2));

        statementSet.execute();
    }
    public static String sourceTableDDL1() {
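        // Kafka source for the deal stream (JSON-encoded messages).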
        return String.format("create table %s(\n"+
                      "    `dealName`            STRING,\n" +
                      "   `dealId`              BIGINT,\n" +
                      "    `completionTime`      TIMESTAMP(3)\n" +
                      ")\n" +
                      "WITH (\n" +
                      "    'connector' = 'kafka', \n" +
                      "    'format' = 'json',\n" +
                      "    'scan.startup.mode' = 'latest-offset',\n" +
                      "    'properties.bootstrap.servers' = '30.43.230.221:9092',\n" +
                      "    'sink.parallelism' = '8',\n" +
                      "    'topic' = 'test_json_deal_ka',\n" +
                      "    'properties.group.id' = 'consumer_deal'\n" +
                      ")", sourceTable1);
    }
    public static String sourceTableDDL2() {
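        // Kafka source for the deal-detail stream (JSON-encoded messages).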
        return String.format("create table %s(\n"+
                "    `dealName`      STRING,\n" +
                "    `shard`         STRING,\n" +
                "    `completionTime` TIMESTAMP(3)\n" +
                ")\n" +
                "WITH (\n" +
                "    'connector' = 'kafka', \n" +
                "    'format' = 'json',\n" +
                "    'scan.startup.mode' = 'latest-offset',\n" +
                "    'properties.bootstrap.servers' = '30.43.230.221:9092',\n" +
                "    'sink.parallelism' = '8',\n" +
                "    'topic' = 'test_json_deal_detail_ka',\n" +
                "    'properties.group.id' = 'consumer_deal_detail'\n" +
                ")", sourceTable2);
    }

    public static String sinkTableDDL1() {
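        // Hudi MOR sink for stream 1: writes dealName, dealId and record_time_deal to the shared
        // table path; its log files are distinguished by 'hoodie.write.log.suffix' = 'job1'.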
        return String.format("create table %s(\n" +
                        "    dealName  STRING,\n" +
                        "    dealId    BIGINT,\n" +
                        "    shard     STRING,\n" +
                        "    `record_time_deal`        BIGINT,\n" +
                        "    `record_time_deal_detail` BIGINT,\n" +
                        "    PRIMARY KEY (dealName) NOT ENFORCED\n" +
                        ")\n" +
                " PARTITIONED BY (record_time_deal)\n" +
                    " with (\n" +
                    "  'connector' = 'hudi',\n" +
                    "  'path' = '%s',\n" +
                    "  'table.type' = 'MERGE_ON_READ',\n" +
                    "  'write.bucket_assign.tasks' = '15',\n" +
                    "  'write.tasks' = '15',\n" +
                    "  'write.partition.format' = 'yyyyMMdd',\n" +
                    "  'write.partition.timestamp.type' = 'EPOCHMILLISECONDS',\n" +
                    "  'hoodie.bucket.index.num.buckets' = '5',\n" +
                    "  'changelog.enabled' = 'true',\n" +
                    "  'index.type' = 'BUCKET',\n" +
                    "  'write.precombine' = 'true',\n" +
                    "  'hoodie.bucket.index.num.buckets' = '5',\n" +
                    "  'write.precombine.field' = 'record_time_deal:dealId|record_time_deal_detail:shard',"
                + "  'write.payload.class' = '" + PartialUpdateAvroPayload.class.getName() + "',\n"
                + "  'hoodie.write.log.suffix' = 'job1',\n"
                + "  'hoodie.write.concurrency.mode' = 'optimistic_concurrency_control',\n"
                + "  'hoodie.write.lock.provider' = 'org.apache.hudi.client.transaction.lock.HdfsBasedLockProvider',\n"
                + "  'hoodie.cleaner.policy.failed.writes' = 'LAZY',\n"
                + "  'hoodie.cleaner.policy' = 'KEEP_LATEST_BY_HOURS',\n"
                + "  'hoodie.consistency.check.enabled' = 'false',\n"
                + "  'hoodie.write.lock.early.conflict.detection.enable' = 'true',\n"
                + "  'hoodie.write.lock.early.conflict.detection.strategy' = '"
                + SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy.class.getName() + "',\n"
//            + "  'hoodie.logfile.data.block.max.size' = '40',\n"
                + "  'hoodie.keep.min.commits' = '1440',\n"
                + "  'hoodie.keep.max.commits' = '2880',\n"
                + "  'compaction.schedule.enabled'='false',\n"
                + "  'compaction.async.enabled'='false',\n"
                + "  'compaction.trigger.strategy'='num_or_time',\n"
                + "  'compaction.delta_commits' ='5',\n"
                + "  'compaction.delta_seconds' ='300',\n"
                + "  'compaction.max_memory' = '3096',\n"
                + "  'clean.async.enabled' ='false',\n"
                + "  'hive_sync.enable' = 'true',\n"
                + "  'hive_sync.mode' = 'hms',\n"
                + "  'hive_sync.db' = '%s',\n"
                + "  'hive_sync.table' = '%s',\n"
                + "  'hive_sync.metastore.uris' = '%s'\n"
                + ")", sinkAliasTable1, basePath, dbName, targetTable, metastoreUrl);
    }

    public static String sinkTableDDL2() {
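        // Hudi MOR sink for stream 2: writes dealName, shard and record_time_deal_detail to the same
        // table path; its log files are distinguished by 'hoodie.write.log.suffix' = 'job2'.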
        return String.format("create table %s(\n" +
                "    dealName  STRING,\n" +
                "    dealId    BIGINT,\n" +
                "    shard     STRING,\n" +
                "    `record_time_deal`        BIGINT,\n" +
                "    `record_time_deal_detail` BIGINT,\n" +
                "    PRIMARY KEY (dealName) NOT ENFORCED\n" +
                ")\n" +
                " PARTITIONED BY (record_time_deal_detail)\n" +
                " with (\n" +
                "  'connector' = 'hudi',\n" +
                "  'path' = '%s',\n" +
                "  'table.type' = 'MERGE_ON_READ',\n" +
                "  'write.bucket_assign.tasks' = '15',\n" +
                "  'write.tasks' = '15',\n" +
                "  'write.partition.format' = 'yyyyMMdd',\n" +
                "  'write.partition.timestamp.type' = 'EPOCHMILLISECONDS',\n" +
                "  'hoodie.bucket.index.num.buckets' = '5',\n" +
                "  'changelog.enabled' = 'true',\n" +
                "  'index.type' = 'BUCKET',\n" +
                "  'write.precombine' = 'true',\n" +
                "  'hoodie.bucket.index.num.buckets' = '5',\n" +
                "  'write.precombine.field' = 'record_time_deal:dealId|record_time_deal_detail:shard',"
                + "  'write.payload.class' = '" + PartialUpdateAvroPayload.class.getName() + "',\n"
                + "  'hoodie.write.log.suffix' = 'job2',\n"
                + "  'hoodie.write.concurrency.mode' = 'optimistic_concurrency_control',\n"
                + "  'hoodie.write.lock.provider' = 'org.apache.hudi.client.transaction.lock.HdfsBasedLockProvider',\n"
                + "  'hoodie.cleaner.policy.failed.writes' = 'LAZY',\n"
                + "  'hoodie.cleaner.policy' = 'KEEP_LATEST_BY_HOURS',\n"
                + "  'hoodie.consistency.check.enabled' = 'false',\n"
                + "  'hoodie.write.lock.early.conflict.detection.enable' = 'true',\n"
                + "  'hoodie.write.lock.early.conflict.detection.strategy' = '"
                + SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy.class.getName() + "',\n"
//            + "  'hoodie.logfile.data.block.max.size' = '40',\n"
                + "  'hoodie.keep.min.commits' = '1440',\n"
                + "  'hoodie.keep.max.commits' = '2880',\n"
                + "  'compaction.schedule.enabled'='false',\n"
                + "  'compaction.async.enabled'='false',\n"
                + "  'compaction.trigger.strategy'='num_or_time',\n"
                + "  'compaction.delta_commits' ='5',\n"
                + "  'compaction.delta_seconds' ='300',\n"
                + "  'clean.async.enabled' ='false',\n"
                + "  'hive_sync.enable' = 'true',\n"
                + "  'hive_sync.mode' = 'hms',\n"
                + "  'hive_sync.db' = '%s',\n"
                + "  'hive_sync.table' = '%s',\n"
                + "  'hive_sync.metastore.uris' = '%s'\n"
                + ")", sinkAliasTable2, basePath, dbName, targetTable, metastoreUrl);
    }

}

Parameter reference:

| Parameter | Required | Default | Notes |
| --- | --- | --- | --- |
| path | Required | N/A | Path of the target table |
| table.type | Optional | MERGE_ON_READ | Table type: COPY_ON_WRITE or MERGE_ON_READ |
| write.operation | Optional | upsert | Write operation: UPSERT or INSERT |
| write.payload.class | Required | PartialUpdateAvroPayload | Payload class used to merge records; set to PartialUpdateAvroPayload.class.getName() |
| write.partition.format | Optional | N/A | Partition format: yyyyMMdd for daily partitions, yyyyMMddHH for hourly partitions |
| write.partition.timestamp.type | Optional | N/A | Partition timestamp type, used when the partition field is bigint (long); value: EPOCHMILLISECONDS |
| write.precombine | Required | true | |
| write.precombine.field | Required | ts | Format `_ts1:name\|_ts2:age`: `_ts1` and `_ts2` are ordering fields, the field after each colon is the field to be updated, and the vertical bar separates different streams |
| hoodie.write.log.suffix | Required | | Log file suffix, used to distinguish different jobs |
| index.type | Required | FLINK_STATE | Set to BUCKET here |
| hoodie.bucket.index.num.buckets | Required | 256 | Estimate based on data volume |
| hoodie.write.concurrency.mode | Required | SINGLE_WRITER | Set to optimistic_concurrency_control |
| hoodie.cleaner.policy.failed.writes | Required | LAZY | Set to LAZY |
| hoodie.write.lock.early.conflict.detection.enable | Required | true | |
| hoodie.write.lock.early.conflict.detection.strategy | Required | | Set to SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy.class.getName() |
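To make the `write.precombine.field` format above concrete, here is a sketch of how the two partial records written by the sample job are merged on the primary key `dealName` (the values are illustrative):

-- Stream 1 (sink_1) writes:  dealName = 'd1', dealId = 100,   record_time_deal = 1662600000000
-- Stream 2 (sink_2) writes:  dealName = 'd1', shard = 's-3',  record_time_deal_detail = 1662600005000
-- With 'write.precombine.field' = 'record_time_deal:dealId|record_time_deal_detail:shard',
-- record_time_deal orders updates to dealId and record_time_deal_detail orders updates to shard,
-- so PartialUpdateAvroPayload combines the two partial rows into one:
--   dealName = 'd1', dealId = 100, shard = 's-3',
--   record_time_deal = 1662600000000, record_time_deal_detail = 1662600005000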

 

-- Verify the multi-stream merge
set `supersql.domain.mapping.jdbc.serverType`=livy;/* use Livy as the data source type */
set `supersql.datasource.default`=hive_online_internal;/* system data source used for the query; do not change hive_online_internal */
set `supersql.bypass.forceAll`=true;/* enable pass-through to the data source */
set `livy.session.cache.enabled`=false;/* disable session caching */
set `livy.session.conf.spark.yarn.dist.jars`=`hdfs://ss-teg-4-v2/user/tdwadmin/spark/20200609/iceberg-spark3-runtime-tencent.jar,hdfs://ss-teg-4-v2/user/tdwadmin/spark/20220915/hudi-spark3.1-bundle_2.12-0.12.0-3-tencent.jar`;
set `livy.session.conf.spark.sql.extensions`=`org.apache.spark.sql.hudi.HoodieSparkSessionExtension`;
select * from csig_billing_rt_test.test_dwd_order_deal_ri_91_rt;
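Because the target table is MERGE_ON_READ and Hive sync is enabled, the sync typically registers two Hive tables: a read-optimized view with the `_ro` suffix and a real-time view with the `_rt` suffix. The query above reads the real-time view; a read-optimized query would look like the sketch below, and it only reflects data that has already been compacted (compaction is disabled in the writer config above, so it must be run separately before `_ro` shows data).

select * from csig_billing_rt_test.test_dwd_order_deal_ri_91_ro;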

 
