Flink Connector 写入 Iceberg 流程源码解析_confluent icebergsinkconnector

先自我介绍一下,小编浙江大学毕业,去过华为、字节跳动等大厂,目前阿里P7

深知大多数程序员,想要提升技能,往往是自己摸索成长,但自己不成体系的自学效果低效又漫长,而且极易碰到天花板技术停滞不前!

因此收集整理了一份《2024年最新大数据全套学习资料》,初衷也很简单,就是希望能够帮助到想自学提升又不知道该从何学起的朋友。
img
img
img
img
img

既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,涵盖了95%以上大数据知识点,真正体系化!

由于文件比较多,这里只是将部分目录截图出来,全套包含大厂面经、学习笔记、源码讲义、实战项目、大纲路线、讲解视频,并且后续会持续更新

如果你需要这些资料,可以添加V获取:vip204888 (备注大数据)
img

正文

  return writerStream;
}

### createStreamWriter 方法



/**
 * Creates the {@link IcebergStreamWriter} operator that writes {@link RowData} records
 * into Iceberg data files.
 *
 * @param table            the target Iceberg table (copied into a serializable snapshot)
 * @param flinkWriteConf   resolved Flink write configuration (file format, target file size, upsert mode)
 * @param flinkRowType     Flink row type matching the table schema
 * @param equalityFieldIds field ids used as equality keys for upsert/delete writes
 * @return a new stream-writer operator bound to a task-writer factory
 */
static IcebergStreamWriter<RowData> createStreamWriter(
    Table table,
    FlinkWriteConf flinkWriteConf,
    RowType flinkRowType,
    List<Integer> equalityFieldIds) {

  // Copy the table into a serializable form so it can be shipped to the task managers.
  Table serializableTable = SerializableTable.copyOf(table);
  FileFormat format = flinkWriteConf.dataFileFormat();
  // The factory creates per-task writers matching the table schema.
  TaskWriterFactory<RowData> taskWriterFactory =
      new RowDataTaskWriterFactory(
          serializableTable,
          flinkRowType,
          flinkWriteConf.targetDataFileSize(),
          format,
          writeProperties(table, format, flinkWriteConf),
          equalityFieldIds,
          flinkWriteConf.upsertMode());
  // Wrap the factory in the stream-writer operator.
  return new IcebergStreamWriter<>(table.name(), taskWriterFactory);
}


### IcebergStreamWriter 类


该类为一个 Flink 内部的 OneInputStreamOperator 类,拥有 Flink 算子相关特性



/**
 * Flink operator that writes incoming records into Iceberg data files and emits a
 * {@link WriteResult} describing the completed files downstream to the committer.
 *
 * <p>It is a regular {@code OneInputStreamOperator}, so it participates in Flink's
 * operator lifecycle and checkpointing like any other operator.
 *
 * @param <T> type of the incoming records (e.g. {@code RowData})
 */
class IcebergStreamWriter<T> extends AbstractStreamOperator<WriteResult>
    implements OneInputStreamOperator<T, WriteResult>, BoundedOneInput {

  @Override
  public void open() {
    // Metrics for this writer (flush results, durations, ...).
    this.writerMetrics = new IcebergStreamWriterMetrics(super.metrics, fullTableName);

    // Initialize the factory with subtask/attempt ids so output file names are unique.
    this.taskWriterFactory.initialize(subTaskId, attemptId);

    // Create the task writer. Four variants exist, depending on whether the Iceberg
    // table is partitioned and whether upsert mode is enabled:
    //   UnpartitionedWriter            : unpartitioned, insert-only
    //   RowDataPartitionedFanoutWriter : partitioned, insert-only
    //   UnpartitionedDeltaWriter       : unpartitioned, upsert
    //   PartitionedDeltaWriter         : partitioned, upsert
    this.writer = taskWriterFactory.create();
  }

  @Override
  public void processElement(StreamRecord<T> element) throws Exception {
    // Write the record through the task writer.
    writer.write(element.getValue());
  }

  // Completes the data files written so far and sends the WriteResult downstream so the
  // committer can commit them in one snapshot.
  private void flush() throws IOException {
    if (writer == null) {
      return;
    }

    long startNano = System.nanoTime();
    WriteResult result = writer.complete();
    writerMetrics.updateFlushResult(result);
    output.collect(new StreamRecord<>(result));
    writerMetrics.flushDuration(TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));

    // A fresh writer is created for the next checkpoint interval.
    writer = null;
  }
}


### IcebergFilesCommitter 类



class IcebergFilesCommitter extends AbstractStreamOperator
implements OneInputStreamOperator<WriteResult, Void>, BoundedOneInput {


@Override
public void initializeState(StateInitializationContext context) throws Exception {

// 最大连续空提交 
// 在间断指定次数 Checkpoint 都没有数据后才真正触发 Commit,生成 Snapshot。
  // 减少空快照生成
maxContinuousEmptyCommits =
    PropertyUtil.propertyAsInt(table.properties(), MAX\_CONTINUOUS\_EMPTY\_COMMITS, 10);

// 创建 文件输出 OutputFileFactory
this.manifestOutputFileFactory =
    FlinkManifestUtil.createOutputFileFactory(
        table, flinkJobId, operatorUniqueId, subTaskId, attemptId);

if (context.isRestored()) {

// 从状态中恢复未提交的数据文件
NavigableMap<Long, byte[]> uncommittedDataFiles =
Maps.newTreeMap(checkpointsState.get().iterator().next())
.tailMap(maxCommittedCheckpointId, false);
if (!uncommittedDataFiles.isEmpty()) {
// Committed all uncommitted data files from the old flink job to iceberg table.
long maxUncommittedCheckpointId = uncommittedDataFiles.lastKey();
// 如果存在未提交的文件 进行提交
commitUpToCheckpoint(
uncommittedDataFiles, restoredFlinkJobId, operatorUniqueId, maxUncommittedCheckpointId);
}
}
}

@Override
public void snapshotState(StateSnapshotContext context) throws Exception {
  // NOTE(review): the excerpt used checkpointId/startNano without declaring them; they
  // come from the snapshot context and the current time, matching the upstream source.
  long checkpointId = context.getCheckpointId();
  long startNano = System.nanoTime();

  // Turn the data files completed for this checkpoint into a manifest and record it
  // under the checkpoint id.
  dataFilesPerCheckpoint.put(checkpointId, writeToManifest(checkpointId));
  // Reset the snapshot state to the latest state.
  checkpointsState.clear();
  // Persist the manifest map into operator state.
  checkpointsState.add(dataFilesPerCheckpoint);

  jobIdState.clear();
  jobIdState.add(flinkJobId);

  // Clear the local buffer for current checkpoint.
  writeResultsOfCurrentCkpt.clear();
  committerMetrics.checkpointDuration(
      TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));
}

@Override
public void notifyCheckpointComplete(long checkpointId) throws Exception {

if (checkpointId > maxCommittedCheckpointId) {
LOG.info(“Checkpoint {} completed. Attempting commit.”, checkpointId);
// 完成 checkpoint 对数据进行 COMMIT
commitUpToCheckpoint(dataFilesPerCheckpoint, flinkJobId, operatorUniqueId, checkpointId);
this.maxCommittedCheckpointId = checkpointId;
} else {
LOG.info(
“Skipping committing checkpoint {}. {} is already committed.”,
checkpointId,
maxCommittedCheckpointId);
}
}

/**
 * Commits every pending manifest whose checkpoint id is less than or equal to
 * {@code checkpointId}, then deletes the manifest files that were committed.
 */
private void commitUpToCheckpoint(
    NavigableMap<Long, byte[]> deltaManifestsMap,
    String newFlinkJobId,
    String operatorId,
    long checkpointId)
    throws IOException {
  // All entries up to and including this checkpoint are pending commit.
  NavigableMap<Long, byte[]> pendingMap = deltaManifestsMap.headMap(checkpointId, true);
  List<ManifestFile> manifests = Lists.newArrayList();
  NavigableMap<Long, WriteResult> pendingResults = Maps.newTreeMap();
  for (Map.Entry<Long, byte[]> e : pendingMap.entrySet()) {
    // Skip checkpoints that produced no data files.
    if (Arrays.equals(EMPTY_MANIFEST_DATA, e.getValue())) {
      continue;
    }

    DeltaManifests deltaManifests =
        SimpleVersionedSerialization.readVersionAndDeSerialize(
            DeltaManifestsSerializer.INSTANCE, e.getValue());
    pendingResults.put(
        e.getKey(),
        FlinkManifestUtil.readCompletedFiles(deltaManifests, table.io(), table.specs()));
    manifests.addAll(deltaManifests.manifests());
  }

  // Aggregate record counts and file sizes of the files pending commit.
  CommitSummary summary = new CommitSummary(pendingResults);
  // Commit the pending files to the table.
  commitPendingResult(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
  committerMetrics.updateCommitSummary(summary);
  pendingMap.clear();
  // Remove manifest files whose contents are now committed.
  deleteCommittedManifests(manifests, newFlinkJobId, checkpointId);
}

private void commitPendingResult(
NavigableMap<Long, WriteResult> pendingResults,
CommitSummary summary,
String newFlinkJobId,
String operatorId,
long checkpointId) {

continuousEmptyCheckpoints = totalFiles == 0 ? continuousEmptyCheckpoints + 1 : 0;
// 数据文件不问 0 或者 连续最大空提交到达了配置的参数阈值触发提交
if (totalFiles != 0 || continuousEmptyCheckpoints % maxContinuousEmptyCommits == 0) {
if (replacePartitions) {
// replace 提交
// 使用 newReplacePartitions()
replacePartitions(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
} else {
// 普通提交
// 使用 newAppend()
commitDeltaTxn(pendingResults, summary, newFlinkJobId, operatorId, checkpointId);
}
continuousEmptyCheckpoints = 0;

}

private void replacePartitions(
NavigableMap<Long, WriteResult> pendingResults,
CommitSummary summary,
String newFlinkJobId,
String operatorId,
long checkpointId) {
Preconditions.checkState(
summary.deleteFilesCount() == 0, “Cannot overwrite partitions with delete files.”);
// 使用 newReplacePartitions 提交
ReplacePartitions dynamicOverwrite = table.newReplacePartitions().scanManifestsWith(workerPool);
for (WriteResult result : pendingResults.values()) {
Preconditions.checkState(
result.referencedDataFiles().length == 0, “Should have no referenced data files.”);
Arrays.stream(result.dataFiles()).forEach(dynamicOverwrite::addFile);
}

}

private void commitDeltaTxn(
NavigableMap<Long, WriteResult> pendingResults,
CommitSummary summary,
String newFlinkJobId,
String operatorId,
long checkpointId) {
if (summary.deleteFilesCount() == 0) {
// To be compatible with iceberg format V1.
AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool);
for (WriteResult result : pendingResults.values()) {
Preconditions.checkState(
result.referencedDataFiles().length == 0,
“Should have no referenced data files for append.”);
Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
}
commitOperation(appendFiles, summary, “append”, newFlinkJobId, operatorId, checkpointId);
} else {
// To be compatible with iceberg format V2.

网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。

需要这份系统化的资料的朋友,可以添加V获取:vip204888 (备注大数据)
img

一个人可以走的很快,但一群人才能走的更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!

be compatible with iceberg format V2.

网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。

需要这份系统化的资料的朋友,可以添加V获取:vip204888 (备注大数据)
[外链图片转存中…(img-zy8aJISi-1713122410212)]

一个人可以走的很快,但一群人才能走的更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值