Preface:
1. Both the streaming JAR job and the query side must use Hudi version 0.12.0-3-tencent.
2. Kafka is currently the only supported source.
3. The timestamp used as the Hudi partition field currently must be a millisecond-precision epoch value (see the snippet below).
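A quick, hypothetical illustration of what point 3 means in practice; in the job itself the conversion happens in SQL via `UNIX_TIMESTAMP(CAST(completionTime AS STRING)) * 1000`:

// Hypothetical example: the partition field must hold epoch milliseconds (13 digits),
// not epoch seconds (10 digits).
long recordTime = java.time.Instant.parse("2022-09-15T00:00:00Z").toEpochMilli();
System.out.println(recordTime); // 1663200000000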
Full code:
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.StatementSet;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.hudi.common.model.PartialUpdateAvroPayload;
import org.apache.hudi.table.marker.SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FlinkPartialUpdateMOR_ka_2 {

    private static final Logger LOG = LoggerFactory.getLogger(FlinkPartialUpdateMOR_ka_2.class);

    // Kafka source tables and the two Hudi sink aliases.
    private static final String sourceTable1 = "test_kafka_deal";
    private static final String sourceTable2 = "test_kafka_deal_detail";
    private static final String sinkAliasTable1 = "sink_1";
    private static final String sinkAliasTable2 = "sink_2";

    // Both sink aliases point at the same physical Hudi table.
    private static final String dbName = "csig_billing_rt_test";
    private static final String targetTable = "test_dwd_order_deal_ri_91";
    private static final String basePath = "hdfs://qy-pcg-8-v3/stage/interface/SNG/g_sng_cloudp_qc_billing_settle/tdw/warehouse/"
            + dbName + ".db/" + targetTable;
    private static final String metastoreUrl = "thrift://ss-qe-oms.tencent-distribute.com:8096";

    private FlinkPartialUpdateMOR_ka_2() {
    }
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        env.setParallelism(1);
        // Hudi commits on checkpoint: enable exactly-once checkpointing every 1s.
        env.enableCheckpointing(1000, CheckpointingMode.EXACTLY_ONCE);

        Configuration configuration = tableEnv.getConfig().getConfiguration();
        configuration.setString("table.dynamic-table-options.enabled", "true");

        LOG.info("sourceTableDDL1 ddl: {}", sourceTableDDL1());
        tableEnv.executeSql(sourceTableDDL1());
        LOG.info("sourceTableDDL2 ddl: {}", sourceTableDDL2());
        tableEnv.executeSql(sourceTableDDL2());
        LOG.info("sinkTableDDL1 ddl: {}", sinkTableDDL1());
        tableEnv.executeSql(sinkTableDDL1());
        LOG.info("sinkTableDDL2 ddl: {}", sinkTableDDL2());
        tableEnv.executeSql(sinkTableDDL2());

        // Each stream writes only its own columns; event time is converted to an
        // epoch-milliseconds partition value (see preface, point 3).
        StatementSet statementSet = tableEnv.createStatementSet();
        statementSet.addInsertSql(String.format(
                "insert into %s(dealName, dealId, record_time_deal)\n"
                        + " select dealName, dealId, UNIX_TIMESTAMP(CAST(completionTime AS STRING)) * 1000 as record_time_deal from %s",
                sinkAliasTable1, sourceTable1));
        statementSet.addInsertSql(String.format(
                "insert into %s(dealName, shard, record_time_deal_detail)\n"
                        + " select dealName, shard, UNIX_TIMESTAMP(CAST(completionTime AS STRING)) * 1000 as record_time_deal_detail from %s",
                sinkAliasTable2, sourceTable2));
        statementSet.execute();
    }
    public static String sourceTableDDL1() {
        return String.format("create table %s(\n"
                + "  `dealName` STRING,\n"
                + "  `dealId` BIGINT,\n"
                + "  `completionTime` TIMESTAMP(3)\n"
                + ") WITH (\n"
                + "  'connector' = 'kafka',\n"
                + "  'format' = 'json',\n"
                + "  'scan.startup.mode' = 'latest-offset',\n"
                + "  'properties.bootstrap.servers' = '30.43.230.221:9092',\n"
                + "  'topic' = 'test_json_deal_ka',\n"
                + "  'properties.group.id' = 'consumer_deal'\n"
                + ")", sourceTable1);
    }
    public static String sourceTableDDL2() {
        return String.format("create table %s(\n"
                + "  `dealName` STRING,\n"
                + "  `shard` STRING,\n"
                + "  `completionTime` TIMESTAMP(3)\n"
                + ") WITH (\n"
                + "  'connector' = 'kafka',\n"
                + "  'format' = 'json',\n"
                + "  'scan.startup.mode' = 'latest-offset',\n"
                + "  'properties.bootstrap.servers' = '30.43.230.221:9092',\n"
                + "  'topic' = 'test_json_deal_detail_ka',\n"
                + "  'properties.group.id' = 'consumer_deal_detail'\n"
                + ")", sourceTable2);
    }
    public static String sinkTableDDL1() {
        return String.format("create table %s(\n"
                + "  dealName STRING,\n"
                + "  dealId BIGINT,\n"
                + "  shard STRING,\n"
                + "  `record_time_deal` bigint,\n"
                + "  `record_time_deal_detail` bigint,\n"
                + "  PRIMARY KEY (dealName) NOT ENFORCED\n"
                + ") PARTITIONED BY (record_time_deal)\n"
                + "with (\n"
                + "  'connector' = 'hudi',\n"
                + "  'path' = '%s',\n"
                + "  'table.type' = 'MERGE_ON_READ',\n"
                + "  'write.bucket_assign.tasks' = '15',\n"
                + "  'write.tasks' = '15',\n"
                + "  'write.partition.format' = 'yyyyMMdd',\n"
                + "  'write.partition.timestamp.type' = 'EPOCHMILLISECONDS',\n"
                + "  'hoodie.bucket.index.num.buckets' = '5',\n"
                + "  'changelog.enabled' = 'true',\n"
                + "  'index.type' = 'BUCKET',\n"
                + "  'write.precombine' = 'true',\n"
                // ordering field before each colon, field to update after it; '|' separates the two streams
                + "  'write.precombine.field' = 'record_time_deal:dealId|record_time_deal_detail:shard',\n"
                + "  'write.payload.class' = '" + PartialUpdateAvroPayload.class.getName() + "',\n"
                // distinct log suffix per job so the two writers' log files do not collide
                + "  'hoodie.write.log.suffix' = 'job1',\n"
                + "  'hoodie.write.concurrency.mode' = 'optimistic_concurrency_control',\n"
                + "  'hoodie.write.lock.provider' = 'org.apache.hudi.client.transaction.lock.HdfsBasedLockProvider',\n"
                + "  'hoodie.cleaner.policy.failed.writes' = 'LAZY',\n"
                + "  'hoodie.cleaner.policy' = 'KEEP_LATEST_BY_HOURS',\n"
                + "  'hoodie.consistency.check.enabled' = 'false',\n"
                + "  'hoodie.write.lock.early.conflict.detection.enable' = 'true',\n"
                + "  'hoodie.write.lock.early.conflict.detection.strategy' = '"
                + SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy.class.getName() + "',\n"
                // + "  'hoodie.logfile.data.block.max.size' = '40',\n"
                + "  'hoodie.keep.min.commits' = '1440',\n"
                + "  'hoodie.keep.max.commits' = '2880',\n"
                + "  'compaction.schedule.enabled' = 'false',\n"
                + "  'compaction.async.enabled' = 'false',\n"
                + "  'compaction.trigger.strategy' = 'num_or_time',\n"
                + "  'compaction.delta_commits' = '5',\n"
                + "  'compaction.delta_seconds' = '300',\n"
                + "  'compaction.max_memory' = '3096',\n"
                + "  'clean.async.enabled' = 'false',\n"
                + "  'hive_sync.enable' = 'true',\n"
                + "  'hive_sync.mode' = 'hms',\n"
                + "  'hive_sync.db' = '%s',\n"
                + "  'hive_sync.table' = '%s',\n"
                + "  'hive_sync.metastore.uris' = '%s'\n"
                + ")", sinkAliasTable1, basePath, dbName, targetTable, metastoreUrl);
    }
    public static String sinkTableDDL2() {
        return String.format("create table %s(\n"
                + "  dealName STRING,\n"
                + "  dealId BIGINT,\n"
                + "  shard STRING,\n"
                + "  `record_time_deal` bigint,\n"
                + "  `record_time_deal_detail` bigint,\n"
                + "  PRIMARY KEY (dealName) NOT ENFORCED\n"
                + ") PARTITIONED BY (record_time_deal_detail)\n"
                + "with (\n"
                + "  'connector' = 'hudi',\n"
                + "  'path' = '%s',\n"
                + "  'table.type' = 'MERGE_ON_READ',\n"
                + "  'write.bucket_assign.tasks' = '15',\n"
                + "  'write.tasks' = '15',\n"
                + "  'write.partition.format' = 'yyyyMMdd',\n"
                + "  'write.partition.timestamp.type' = 'EPOCHMILLISECONDS',\n"
                + "  'hoodie.bucket.index.num.buckets' = '5',\n"
                + "  'changelog.enabled' = 'true',\n"
                + "  'index.type' = 'BUCKET',\n"
                + "  'write.precombine' = 'true',\n"
                // same precombine rule as sink_1; only the log suffix differs
                + "  'write.precombine.field' = 'record_time_deal:dealId|record_time_deal_detail:shard',\n"
                + "  'write.payload.class' = '" + PartialUpdateAvroPayload.class.getName() + "',\n"
                + "  'hoodie.write.log.suffix' = 'job2',\n"
                + "  'hoodie.write.concurrency.mode' = 'optimistic_concurrency_control',\n"
                + "  'hoodie.write.lock.provider' = 'org.apache.hudi.client.transaction.lock.HdfsBasedLockProvider',\n"
                + "  'hoodie.cleaner.policy.failed.writes' = 'LAZY',\n"
                + "  'hoodie.cleaner.policy' = 'KEEP_LATEST_BY_HOURS',\n"
                + "  'hoodie.consistency.check.enabled' = 'false',\n"
                + "  'hoodie.write.lock.early.conflict.detection.enable' = 'true',\n"
                + "  'hoodie.write.lock.early.conflict.detection.strategy' = '"
                + SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy.class.getName() + "',\n"
                // + "  'hoodie.logfile.data.block.max.size' = '40',\n"
                + "  'hoodie.keep.min.commits' = '1440',\n"
                + "  'hoodie.keep.max.commits' = '2880',\n"
                + "  'compaction.schedule.enabled' = 'false',\n"
                + "  'compaction.async.enabled' = 'false',\n"
                + "  'compaction.trigger.strategy' = 'num_or_time',\n"
                + "  'compaction.delta_commits' = '5',\n"
                + "  'compaction.delta_seconds' = '300',\n"
                + "  'clean.async.enabled' = 'false',\n"
                + "  'hive_sync.enable' = 'true',\n"
                + "  'hive_sync.mode' = 'hms',\n"
                + "  'hive_sync.db' = '%s',\n"
                + "  'hive_sync.table' = '%s',\n"
                + "  'hive_sync.metastore.uris' = '%s'\n"
                + ")", sinkAliasTable2, basePath, dbName, targetTable, metastoreUrl);
    }
}
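How the two writers combine: each `insert into` above fills only its own columns of the shared Hudi table, and `PartialUpdateAvroPayload` stitches the pieces together per primary key. A worked example with hypothetical values, one record per stream with the same key `d1`:

| Writer | dealName | dealId | shard | record_time_deal | record_time_deal_detail |
| --- | --- | --- | --- | --- | --- |
| sink_1 | d1 | 100 | NULL | 1663200000000 | NULL |
| sink_2 | d1 | NULL | s1 | NULL | 1663200001000 |
| merged row | d1 | 100 | s1 | 1663200000000 | 1663200001000 |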
Parameter reference:
| Parameter | Required | Default | Notes |
| --- | --- | --- | --- |
| path | Required | N/A | Path of the target table |
| table.type | Optional | MERGE_ON_READ | Table type: COPY_ON_WRITE or MERGE_ON_READ |
| write.operation | Optional | upsert | Write operation: UPSERT or INSERT |
| write.payload.class | Required | PartialUpdateAvroPayload | Payload class that processes the data; set to PartialUpdateAvroPayload.class.getName() |
| write.partition.format | Optional | N/A | Partition format: yyyyMMdd partitions by day, yyyyMMddHH by hour |
| write.partition.timestamp.type | Optional | N/A | Partition timestamp type; use EPOCHMILLISECONDS when the partition field is a bigint (long) |
| write.precombine | Required | true | None |
| write.precombine.field | Required | ts | e.g. `_ts1:name\|_ts2:age`: `_ts1` and `_ts2` are ordering fields, the field after each **colon** is the field to **update**, and the **vertical bar** separates the different streams (see the sketch after this table) |
| hoodie.write.log.suffix | Required | None | Log file suffix, used to distinguish different jobs |
| index.type | Required | FLINK_STATE | Set to BUCKET here |
| hoodie.bucket.index.num.buckets | Required | 256 | Estimate based on data volume |
| hoodie.write.concurrency.mode | Required | SINGLE_WRITER | Set to optimistic_concurrency_control |
| hoodie.cleaner.policy.failed.writes | Required | LAZY | Set to LAZY |
| hoodie.write.lock.early.conflict.detection.enable | Required | true | |
| hoodie.write.lock.early.conflict.detection.strategy | Required | SimpleTransactionDirectMarkerBasedEarlyConflictDetectionStrategy.class.getName() | |
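Since `write.precombine.field` carries the whole merge rule, here is a minimal sketch of the semantics it implies for the job above: each `orderingField:updatedField` group is resolved independently, and within a group the record with the larger ordering value contributes the updated field. This is an illustration only, not Hudi's actual PartialUpdateAvroPayload implementation; the `PrecombineSketch` class and its `merge` helper are hypothetical.

import java.util.HashMap;
import java.util.Map;

public class PrecombineSketch {

    /** Merge two versions of the same key, group by group (simplified illustration). */
    static Map<String, Object> merge(Map<String, Object> current, Map<String, Object> incoming) {
        Map<String, Object> result = new HashMap<>(current);
        // groups from 'record_time_deal:dealId|record_time_deal_detail:shard'
        String[][] groups = {{"record_time_deal", "dealId"}, {"record_time_deal_detail", "shard"}};
        for (String[] g : groups) {
            Long curTs = (Long) current.get(g[0]);
            Long newTs = (Long) incoming.get(g[0]);
            // take the incoming group's fields only if its ordering value is present and not older
            if (newTs != null && (curTs == null || newTs >= curTs)) {
                result.put(g[0], newTs);
                result.put(g[1], incoming.get(g[1]));
            }
        }
        return result;
    }

    public static void main(String[] args) {
        Map<String, Object> fromDeal = new HashMap<>();
        fromDeal.put("record_time_deal", 1663200000000L);
        fromDeal.put("dealId", 100L);

        Map<String, Object> fromDealDetail = new HashMap<>();
        fromDealDetail.put("record_time_deal_detail", 1663200001000L);
        fromDealDetail.put("shard", "s1");

        // prints all four fields merged into one row: dealId=100, shard=s1,
        // plus both ordering fields (map iteration order is unspecified)
        System.out.println(merge(fromDeal, fromDealDetail));
    }
}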
-- Verify the multi-stream merge
set `supersql.domain.mapping.jdbc.serverType`=livy;/* set the data source type to Livy */
set `supersql.datasource.default`=hive_online_internal;/* set the system data source for the query; do not change hive_online_internal */
set `supersql.bypass.forceAll`=true;/* enable pass-through mode for the data source */
set `livy.session.cache.enabled`=false;/* disable session caching */
set `livy.session.conf.spark.yarn.dist.jars`=`hdfs://ss-teg-4-v2/user/tdwadmin/spark/20200609/iceberg-spark3-runtime-tencent.jar,hdfs://ss-teg-4-v2/user/tdwadmin/spark/20220915/hudi-spark3.1-bundle_2.12-0.12.0-3-tencent.jar`;
set `livy.session.conf.spark.sql.extensions`=`org.apache.spark.sql.hudi.HoodieSparkSessionExtension`;
-- Hive sync registers two views for a MERGE_ON_READ table: <table>_ro (read-optimized) and
-- <table>_rt (real-time). Query the _rt view to see the merged rows before compaction runs:
select * from csig_billing_rt_test.test_dwd_order_deal_ri_91_rt;