The biggest gap in data lakes today is governance: capabilities such as metadata management and table management are missing out of the box and usually end up being handled through a separate web application with a visual UI. The three mainstream open table formats at the moment are Iceberg, Hudi, and Delta Lake (delta.io), and only Iceberg offers a Java API for creating, altering, and listing tables, as well as for writing and querying data. An open table format should not depend on components like Hadoop, Hive, Spark, or Flink; it should be usable directly from Java, Go, Python, and other languages without heavyweight dependencies. Iceberg does well here: its goal is to be a standard, rather than turning into a database the way Hudi has, which drifts away from the point of a data lake and an open table format. Delta Lake already supports the Iceberg standard, so it seems likely that data lake formats will eventually converge on a single unified standard.
Below is a concrete walkthrough of the Java API against Iceberg 1.4.2: creating a table, operating on it, writing data, and querying data, all from Java. Getting the component versions to agree took a long time to debug; the dependency set below (Spark, Iceberg, MinIO, and so on) is a combination that resolves cleanly. Since only the Java API is used here, you can copy the dependencies as-is, and the Spark ones can be dropped.
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.13</artifactId>
<version>3.5.0</version>
</dependency>
<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-spark_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-core -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-core</artifactId>
<version>1.4.2</version>
</dependency>
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.5.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>1.12.620</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aws</artifactId>
<version>3.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-api</artifactId>
<version>1.4.2</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-bundled-guava</artifactId>
<version>1.4.2</version>
</dependency>
<!-- Other dependencies -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.15.3</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-common</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-parquet -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-parquet</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-aws -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-aws</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-common -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-common</artifactId>
<version>1.4.2</version>
<scope>runtime</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-arrow -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-arrow</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-data -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-data</artifactId>
<version>1.4.2</version>
</dependency>
To operate on tables you first need a catalog. This example stores data in MinIO rather than Hadoop/HDFS, so configure the S3A filesystem to point at MinIO:
Configuration conf = new Configuration();
conf.set("fs.s3a.aws.credentials.provider"," com.amazonaws.auth.InstanceProfileCredentialsProvider,org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,com.amazonaws.auth.EnvironmentVariableCredentialsProvider");
conf.set("fs.s3a.connection.ssl.enabled", "false");
conf.set("fs.s3a.endpoint", "http://127.0.0.1:9000");
conf.set("fs.s3a.access.key", "minioadmin");
conf.set("fs.s3a.secret.key", "minioadmin");
conf.set("fs.s3a.path.style.access", "true");
conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
conf.set("fs.s3a.fast.upload", "true");
String warehousePath = "s3a://test/"; // MinIO bucket path
System.out.println(warehousePath);
HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);
System.out.println(catalog.name());
TableIdentifier name = TableIdentifier.of("iceberg_db", "table_ice");
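Before defining a schema, it is worth a quick sanity check that the catalog can actually reach MinIO over S3A. A minimal sketch using only the standard Catalog/SupportsNamespaces methods; it assumes the test bucket already exists in MinIO (HadoopCatalog will not create the bucket), and on an empty warehouse it simply prints nothing.
// Walk every namespace under the warehouse path and list the tables inside it
for (Namespace ns : catalog.listNamespaces()) {
    System.out.println("namespace: " + ns);
    for (TableIdentifier id : catalog.listTables(ns)) {
        System.out.println("  table: " + id);
    }
}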
Next, create the table.
// Define the table schema
Schema schema = new Schema(
//Types.NestedField.required(1, "level", Types.StringType.get()),
//Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()),
//Types.NestedField.required(3, "message", Types.StringType.get()),
//Types.NestedField.optional(4, "call_stack", Types.ListType.ofRequired(5, Types.StringType.get()))
Types.NestedField.required(1, "id", Types.IntegerType.get()),
Types.NestedField.required(2, "name", Types.StringType.get()),
Types.NestedField.required(3, "birth", Types.StringType.get())
);
// Partition definition (identity partitioning on the id column)
PartitionSpec spec = PartitionSpec.builderFor(schema)
.identity("id")
.build();
// Optionally, table properties can be supplied as a Map<String, String>
// and passed to catalog.createTable(name, schema, spec, properties).
// Create the namespace (database) if it does not already exist
if (!catalog.namespaceExists(Namespace.of("iceberg_db"))) {
catalog.createNamespace(Namespace.of("iceberg_db"));
}
Table table = null;
// Create the table if it does not exist, otherwise load it
if (!catalog.tableExists(name)) {
table = catalog.createTable(name, schema, spec);
} else {
System.out.println("Table already exists");
table = catalog.loadTable(name);
}
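With the table handle in hand, the core Table accessors can confirm what was created; nothing here is MinIO-specific. Note that currentSnapshot() is null until the first commit.
// Inspect the resolved table metadata
System.out.println("schema:   " + table.schema());
System.out.println("spec:     " + table.spec());
System.out.println("location: " + table.location());
System.out.println("snapshot: " + table.currentSnapshot()); // null before any data is committed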
Next, write data into the Iceberg table from Java:
public void javaCatalogAppend(Schema schema, Table table) throws IOException {
// 1. Build the records to insert
GenericRecord record = GenericRecord.create(schema);
ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();
builder.add(record.copy(ImmutableMap.of("id", 1, "name", "liuyang", "birth", "2020-03-08")));
builder.add(record.copy(ImmutableMap.of("id", 2, "name", "chengx", "birth", "2021-03-09")));
ImmutableList<GenericRecord> records = builder.build();
// 2. Write the records to a Parquet data file
System.out.println("table.location(): " + table.location());
String filepath = table.location() + "/" + UUID.randomUUID().toString();
OutputFile file = table.io().newOutputFile(filepath);
DataWriter<GenericRecord> dataWriter =
Parquet.writeData(file)
.schema(schema)
.createWriterFunc(GenericParquetWriter::buildWriter)
.overwrite()
.withSpec(PartitionSpec.unpartitioned())
.build();
try {
dataWriter.write(records);
} finally {
dataWriter.close();
}
// 3. Append the data file to the table and commit
DataFile dataFile = dataWriter.toDataFile();
table.newAppend().appendFile(dataFile).commit();
}
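Every successful commit, including the append above, adds a snapshot to the table metadata, so the write can be verified by walking the snapshot history. A minimal sketch against the same table handle (org.apache.iceberg.Snapshot needs to be imported):
// One line per snapshot: id, operation (e.g. "append") and the commit summary
for (Snapshot snapshot : table.snapshots()) {
    System.out.println(snapshot.snapshotId() + " " + snapshot.operation() + " " + snapshot.summary());
}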
Query the data from Java with a table scan:
private void javaCatalogScan(Table table) {
IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);
CloseableIterable<Record> records = scanBuilder.build();
for (Record r : records) {
System.out.print(r.get(0));
System.out.print("|");
System.out.print(r.get(1));
System.out.print("|");
System.out.print(r.get(2));
System.out.println();
}
}
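The ScanBuilder also takes a row filter and a column projection, so a scan does not have to read the whole table. A minimal sketch against the same table using org.apache.iceberg.expressions.Expressions; the id = 1 predicate and the name projection are just illustrative, and the enclosing method has to declare or handle the IOException that close() can throw.
// Read only rows with id = 1 and only materialize the name column;
// try-with-resources closes the underlying file streams
try (CloseableIterable<Record> filtered = IcebergGenerics.read(table)
        .where(Expressions.equal("id", 1))
        .select("name")
        .build()) {
    for (Record r : filtered) {
        System.out.println(r.getField("name"));
    }
}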
The complete example:
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.types.Types;
import java.io.IOException;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.UUID;
public class icebergapi {
public static void main(String[] args)
throws IOException, NoSuchAlgorithmException, InvalidKeyException {
Configuration conf = new Configuration();
conf.set("fs.s3a.aws.credentials.provider"," com.amazonaws.auth.InstanceProfileCredentialsProvider,org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,com.amazonaws.auth.EnvironmentVariableCredentialsProvider");
conf.set("fs.s3a.connection.ssl.enabled", "false");
conf.set("fs.s3a.endpoint", "http://127.0.0.1:9000");
conf.set("fs.s3a.access.key", "minioadmin");
conf.set("fs.s3a.secret.key", "minioadmin");
conf.set("fs.s3a.path.style.access", "true");
conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
conf.set("fs.s3a.fast.upload", "true");
String warehousePath = "s3a://test/"; // MinIO bucket path
System.out.println(warehousePath);
HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);
System.out.println(catalog.name());
TableIdentifier name = TableIdentifier.of("iceberg_db", "table_ice");
// Define the table schema
Schema schema = new Schema(
//Types.NestedField.required(1, "level", Types.StringType.get()),
//Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()),
//Types.NestedField.required(3, "message", Types.StringType.get()),
//Types.NestedField.optional(4, "call_stack", Types.ListType.ofRequired(5, Types.StringType.get()))
Types.NestedField.required(1, "id", Types.IntegerType.get()),
Types.NestedField.required(2, "name", Types.StringType.get()),
Types.NestedField.required(3, "birth", Types.StringType.get())
);
// Partition definition (identity partitioning on the id column)
PartitionSpec spec = PartitionSpec.builderFor(schema)
.identity("id")
.build();
// Optionally, table properties can be supplied as a Map<String, String>
// and passed to catalog.createTable(name, schema, spec, properties).
// Create the namespace (database) if it does not already exist
if (!catalog.namespaceExists(Namespace.of("iceberg_db"))) {
catalog.createNamespace(Namespace.of("iceberg_db"));
}
Table table = null;
// Create the table if it does not exist, otherwise load it
if (!catalog.tableExists(name)) {
table = catalog.createTable(name, schema, spec);
} else {
System.out.println("Table already exists");
table = catalog.loadTable(name);
}
icebergapi api = new icebergapi();
// Write data
api.javaCatalogAppend(schema, table);
// Query data
api.javaCatalogScan(table);
}
public void javaCatalogAppend(Schema schema, Table table) throws IOException {
// 1. Build the records to insert
GenericRecord record = GenericRecord.create(schema);
ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();
builder.add(record.copy(ImmutableMap.of("id", 1, "name", "liuyang", "birth", "2020-03-08")));
builder.add(record.copy(ImmutableMap.of("id", 2, "name", "chengx", "birth", "2021-03-09")));
ImmutableList<GenericRecord> records = builder.build();
// 2. Write the records to a Parquet data file
System.out.println("table.location(): " + table.location());
String filepath = table.location() + "/" + UUID.randomUUID().toString();
OutputFile file = table.io().newOutputFile(filepath);
DataWriter<GenericRecord> dataWriter =
Parquet.writeData(file)
.schema(schema)
.createWriterFunc(GenericParquetWriter::buildWriter)
.overwrite()
.withSpec(PartitionSpec.unpartitioned())
.build();
try {
dataWriter.write(records);
} finally {
dataWriter.close();
}
// 3. Append the data file to the table and commit
DataFile dataFile = dataWriter.toDataFile();
table.newAppend().appendFile(dataFile).commit();
}
private void javaCatalogScan(Table table) {
IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);
CloseableIterable<Record> records = scanBuilder.build();
for (Record r : records) {
System.out.print(r.get(0));
System.out.print("|");
System.out.print(r.get(1));
System.out.print("|");
System.out.print(r.get(2));
System.out.println();
}
}
}
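The same catalog and table handles also cover basic maintenance. A minimal sketch of two common follow-ups, schema evolution and dropping the table; the email column name is purely illustrative.
// Add an optional column to the live table (metadata-only change, existing files are not rewritten)
table.updateSchema()
    .addColumn("email", Types.StringType.get())
    .commit();
// Remove the table from the catalog and purge its data and metadata files
catalog.dropTable(name, true);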