Flink reading an Iceberg upsert data stream:
Reading the Iceberg data
// Create the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);

// Create the hadoop catalog
Map<String, String> properties = new HashMap<>();
properties.put("type", "iceberg");
properties.put("catalog-type", "hadoop");
properties.put("property-version", "1");
properties.put("warehouse", MyContent.WARE_HOUSE);
CatalogLoader hadoop_catalog = CatalogLoader.hadoop("hadoop_catalog", new Configuration(), properties);

// Load the source table
TableIdentifier tableIdentifier = TableIdentifier.of(Namespace.of("iceberg"), "table_b");
Table tableSource = hadoop_catalog.loadCatalog().loadTable(tableIdentifier);
TableLoader tableLoader = TableLoader.fromCatalog(hadoop_catalog, tableIdentifier);

// Enable the upsert stream (format v2 upgrade of the source table, left commented out)
// TableOperations operations = ((BaseTable) tableSource).operations();
// TableMetadata current = operations.current();
// operations.commit(current, current.upgradeToFormatVersion(2));

// Read the table as an unbounded (streaming) source
DataStream<RowData> sourceStream = FlinkSource.forRowData()
        .env(env)
        .tableLoader(tableLoader)
        .streaming(true)
        .build();
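As a sanity check, the source table's format version can be confirmed before streaming from it. A minimal sketch reusing the classes from the commented-out upgrade above (it assumes TableMetadata.formatVersion() is available in your Iceberg version):

// Check the source table's current format version (v2 is required for row-level deletes)
int formatVersion = ((BaseTable) tableSource).operations().current().formatVersion();
System.out.println("table_b format version: " + formatVersion);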
Writing to the Iceberg table
// Destination table identifier
TableIdentifier tableIdentifierDes = TableIdentifier.of(Namespace.of("iceberg"), "table_b_new");
// Load the destination table
org.apache.iceberg.Table tableDes = hadoop_catalog.loadCatalog().loadTable(tableIdentifierDes);
TableLoader tableLoaderDes = TableLoader.fromCatalog(hadoop_catalog, tableIdentifierDes);

// Enable upserts: upgrade the destination table to format v2
TableOperations operations = ((BaseTable) tableDes).operations();
TableMetadata current = operations.current();
operations.commit(current, current.upgradeToFormatVersion(2));

// Write out to Iceberg, using "id" as the equality key
FlinkSink.forRowData(sourceStream)
        .tableLoader(tableLoaderDes)
        .distributionMode(DistributionMode.HASH)
        .writeParallelism(1)
        .equalityFieldColumns(Arrays.asList("id"))
        .build();

env.execute("iceberg table_b to table_b_new");
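For comparison, the same streaming read can be expressed in Flink SQL with dynamic-table hints. A sketch assuming the option names documented for the Iceberg Flink module ('streaming', 'monitor-interval'); verify them against the 0.11 release:

// Allow /*+ OPTIONS(...) */ hints, register the catalog, then stream the table via SQL
tableEnv.getConfig().getConfiguration()
        .setBoolean("table.dynamic-table-options.enabled", true);
tableEnv.executeSql("CREATE CATALOG hadoop_catalog WITH (\n" +
        "  'type' = 'iceberg',\n" +
        "  'catalog-type' = 'hadoop',\n" +
        "  'warehouse' = '" + MyContent.WARE_HOUSE + "'\n" +
        ")");
tableEnv.executeSql("SELECT * FROM hadoop_catalog.iceberg.table_b " +
        "/*+ OPTIONS('streaming'='true', 'monitor-interval'='10s') */").print();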
Reading a Kafka upsert stream and writing it to Iceberg: no problems
package com.zhengtx.java.consumer;
import com.zhengtx.java.MyContent;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.BaseTable;
import org.apache.iceberg.DistributionMode;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.flink.CatalogLoader;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
public class KafkaConsumerToIceberg {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
        tableEnv.executeSql("CREATE TABLE table_a (\n" +
                "  id INT,\n" +
                "  name STRING\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'topic_a',\n" +
                "  'properties.bootstrap.servers' = 'cdh1:9092',\n" +
                "  'properties.group.id' = 'testGroup',\n" +
                "  'format' = 'canal-json',\n" +
                "  'scan.startup.mode' = 'earliest-offset'\n" +
                ")");
        Table tableSource = tableEnv.sqlQuery("select * from table_a");
        // DataStream<Tuple2<Boolean, Row>> tuple2DataStream = tableEnv.toRetractStream(table, Row.class);
        // tuple2DataStream.print();
        DataStream<Tuple2<Boolean, RowData>> tuple2DataStream = tableEnv.toRetractStream(tableSource, RowData.class);
        // Drop the retract flag (f0) and forward only the RowData payload
        SingleOutputStreamOperator<RowData> input = tuple2DataStream.map(new MapFunction<Tuple2<Boolean, RowData>, RowData>() {
            @Override
            public RowData map(Tuple2<Boolean, RowData> value) throws Exception {
                return value.f1;
            }
        });
        Map<String, String> properties = new HashMap<>();
        properties.put("type", "iceberg");
        properties.put("catalog-type", "hadoop");
        properties.put("property-version", "1");
        properties.put("warehouse", MyContent.WARE_HOUSE);
        // Load the hadoop catalog
        CatalogLoader hadoop_catalog = CatalogLoader.hadoop("hadoop_catalog", new Configuration(), properties);
        // Destination table identifier
        TableIdentifier tableIdentifierDes = TableIdentifier.of(Namespace.of("iceberg"), "table_a");
        // Load the destination table
        org.apache.iceberg.Table tableDes = hadoop_catalog.loadCatalog().loadTable(tableIdentifierDes);
        TableLoader tableLoader = TableLoader.fromCatalog(hadoop_catalog, tableIdentifierDes);
        // Enable upserts: upgrade the destination table to format v2
        TableOperations operations = ((BaseTable) tableDes).operations();
        TableMetadata current = operations.current();
        operations.commit(current, current.upgradeToFormatVersion(2));
        // Write out to Iceberg, using "id" as the equality key
        FlinkSink.forRowData(input)
                .tableLoader(tableLoader)
                .distributionMode(DistributionMode.HASH)
                .writeParallelism(1)
                .equalityFieldColumns(Arrays.asList("id"))
                .build();

        env.execute("kafka topic to iceberg table_a");
    }
}
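Rather than committing a format upgrade at job startup, the destination table can in principle be created as a v2 table up front. A sketch assuming the catalog honors the "format-version" table property at creation time (newer Iceberg releases document this; on 0.11 the TableOperations upgrade above may still be required). The schema below is illustrative:

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

// Create table_a as a format v2 table from the start (illustrative schema)
Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.IntegerType.get()),
        Types.NestedField.optional(2, "name", Types.StringType.get()));
Map<String, String> tableProps = new HashMap<>();
tableProps.put("format-version", "2"); // assumption: applied at create time
hadoop_catalog.loadCatalog().createTable(
        TableIdentifier.of(Namespace.of("iceberg"), "table_a"),
        schema,
        PartitionSpec.unpartitioned(),
        tableProps);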
Streaming reads of Iceberg insert data: no problems.
Streaming reads of Iceberg delete data: error messages below.
When the job starts normally and then reads delete data, the incremental scan fails, because the upsert write committed the deletes as an overwrite snapshot, which the incremental scan cannot consume:
java.lang.UnsupportedOperationException: Found overwrite operation, cannot support incremental data in snapshots (3432724924746171576, 4677707268119051509]
at org.apache.iceberg.IncrementalDataTableScan.snapshotsWithin(IncrementalDataTableScan.java:123)
at org.apache.iceberg.IncrementalDataTableScan.planFiles(IncrementalDataTableScan.java:73)
at org.apache.iceberg.BaseTableScan.planTasks(BaseTableScan.java:244)
at org.apache.iceberg.DataTableScan.planTasks(DataTableScan.java:28)
at org.apache.iceberg.flink.source.FlinkSplitGenerator.tasks(FlinkSplitGenerator.java:86)
at org.apache.iceberg.flink.source.FlinkSplitGenerator.createInputSplits(FlinkSplitGenerator.java:38)
at org.apache.iceberg.flink.source.StreamingMonitorFunction.monitorAndForwardSplits(StreamingMonitorFunction.java:143)
at org.apache.iceberg.flink.source.StreamingMonitorFunction.run(StreamingMonitorFunction.java:121)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:100)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:63)
at org.apache.flink.streaming.runtime.tasks.SourceStreamTask$LegacySourceFunctionThread.run(SourceStreamTask.java:213)
When the delete data is already present at job startup, split deserialization fails instead: the planned splits carry delete-file metadata backed by unmodifiable maps, which Kryo cannot rebuild:
2021-07-21 00:25:07
com.esotericsoftware.kryo.KryoException: java.lang.UnsupportedOperationException
Serialization trace:
columnSizes (org.apache.iceberg.GenericDeleteFile)
deletes (org.apache.iceberg.BaseFileScanTask)
fileScanTask (org.apache.iceberg.BaseFileScanTask$SplitScanTask)
tasks (org.apache.iceberg.BaseCombinedScanTask)
task (org.apache.iceberg.flink.source.FlinkInputSplit)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:125)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:528)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:679)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ObjectArraySerializer.read(DefaultArraySerializers.java:378)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ObjectArraySerializer.read(DefaultArraySerializers.java:289)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:679)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:106)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:528)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:679)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:106)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:528)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:679)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ObjectArraySerializer.read(DefaultArraySerializers.java:378)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ObjectArraySerializer.read(DefaultArraySerializers.java:289)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:679)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:106)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:528)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:679)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:106)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:528)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:761)
at org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer.deserialize(KryoSerializer.java:346)
at org.apache.flink.streaming.runtime.streamrecord.StreamElementSerializer.deserialize(StreamElementSerializer.java:205)
at org.apache.flink.streaming.runtime.streamrecord.StreamElementSerializer.deserialize(StreamElementSerializer.java:46)
at org.apache.flink.runtime.plugable.NonReusingDeserializationDelegate.read(NonReusingDeserializationDelegate.java:55)
at org.apache.flink.runtime.io.network.api.serialization.NonSpanningWrapper.readInto(NonSpanningWrapper.java:335)
at org.apache.flink.runtime.io.network.api.serialization.SpillingAdaptiveSpanningRecordDeserializer.readNonSpanningRecord(SpillingAdaptiveSpanningRecordDeserializer.java:108)
at org.apache.flink.runtime.io.network.api.serialization.SpillingAdaptiveSpanningRecordDeserializer.getNextRecord(SpillingAdaptiveSpanningRecordDeserializer.java:85)
at org.apache.flink.streaming.runtime.io.StreamTaskNetworkInput.emitNext(StreamTaskNetworkInput.java:146)
at org.apache.flink.streaming.runtime.io.StreamOneInputProcessor.processInput(StreamOneInputProcessor.java:67)
at org.apache.flink.streaming.runtime.tasks.StreamTask.processInput(StreamTask.java:351)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:185)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:569)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:534)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:721)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:546)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.UnsupportedOperationException
at java.util.Collections$UnmodifiableMap.put(Collections.java:1457)
at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:144)
at com.esotericsoftware.kryo.serializers.MapSerializer.read(MapSerializer.java:21)
at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:679)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:106)
... 36 more
Conclusion
As of Iceberg 0.11: writing an upsert stream is supported, and batch jobs can read the upserted data, but streaming jobs cannot read an upsert stream.
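Accordingly, a bounded (batch) read of the upserted table does work. A minimal sketch using the same source builder as above with streaming disabled:

// Bounded scan of the table's current snapshot (batch mode)
DataStream<RowData> batchStream = FlinkSource.forRowData()
        .env(env)
        .tableLoader(tableLoader)
        .streaming(false)
        .build();
batchStream.print();
env.execute("batch read of iceberg upsert table");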