The biggest gap in data lakes today is governance: capabilities such as metadata management and table management are missing out of the box and usually end up being handled through a separate web application with a visual UI. The three mainstream open table formats at the moment are Iceberg, Hudi, and Delta Lake (delta.io), and only Iceberg offers a Java API for creating, altering, and listing tables, as well as for writing and querying data. An open table format should not depend on components like Hadoop, Hive, Spark, or Flink; it should be usable directly from Java, Go, Python, and other languages without heavyweight dependencies. Iceberg does well here: its goal is to be a standard, rather than turning into a database the way Hudi has, which drifts away from the point of a data lake and an open table format. Delta Lake already supports the Iceberg standard, so it seems likely that data lake formats will eventually converge on a single unified standard.
Below is a concrete walkthrough of the Java API against Iceberg 1.4.2: creating a table, operating on it, writing data, and querying data, all from Java. Getting the component versions to agree took a long time to debug; the dependency set below (Spark, Iceberg, MinIO, and so on) is a combination that resolves cleanly. Since only the Java API is used here, you can copy the dependencies as-is, and the Spark ones can be dropped.
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.13</artifactId>
<version>3.5.0</version>
</dependency>
<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-spark_2.12</artifactId>
<version>3.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-core -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-core</artifactId>
<version>1.4.2</version>
</dependency>
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.5.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-s3 -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>1.12.620</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aws</artifactId>
<version>3.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-api</artifactId>
<version>1.4.2</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-bundled-guava</artifactId>
<version>1.4.2</version>
</dependency>
<!-- Other dependencies -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.15.3</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-common</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-parquet -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-parquet</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-aws -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-aws</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-common -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-common</artifactId>
<version>1.4.2</version>
<scope>runtime</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-arrow -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-arrow</artifactId>
<version>1.4.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-data -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-data</artifactId>
<version>1.4.2</version>
</dependency>
To operate on tables you first need a catalog. This example stores data in MinIO rather than Hadoop/HDFS, so configure the S3A filesystem to point at MinIO:
Configuration conf = new Configuration();
conf.set("fs.s3a.aws.credentials.provider"," com.amazonaws.auth.InstanceProfileCredentialsProvider,org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,com.amazonaws.auth.EnvironmentVariableCredentialsProvider");
conf.set("fs.s3a.connection.ssl.enabled", "false");
conf.set("fs.s3a.endpoint", "http://127.0.0.1:9000");
conf.set("fs.s3a.access.key", "minioadmin");
conf.set("fs.s3a.secret.key", "minioadmin");
conf.set("fs.s3a.path.style.access", "true");
conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
conf.set("fs.s3a.fast.upload", "true");
String warehousePath = "s3a://test/"; // MinIO bucket path
System.out.println(warehousePath);
HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);
System.out.println(catalog.name());
TableIdentifier name = TableIdentifier.of("iceberg_db", "table_ice");
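Before defining a schema, it is worth a quick sanity check that the catalog can actually reach MinIO over S3A. A minimal sketch using only the standard Catalog/SupportsNamespaces methods; it assumes the test bucket already exists in MinIO (HadoopCatalog will not create the bucket), and on an empty warehouse it simply prints nothing.
// Walk every namespace under the warehouse path and list the tables inside it
for (Namespace ns : catalog.listNamespaces()) {
    System.out.println("namespace: " + ns);
    for (TableIdentifier id : catalog.listTables(ns)) {
        System.out.println("  table: " + id);
    }
}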
Next, create the table.
// Define the table schema
Schema schema = new Schema(
//Types.NestedField.required(1, "level", Types.StringType.get()),
//Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()),
//Types.NestedField.required(3, "message", Types.StringType.get()),
//Types.NestedField.optional(4, "call_stack", Types.ListType.ofRequired(5, Types.StringType.get()))
Types.NestedField.required(1, "id", Types.IntegerType.get()),
Types.NestedField.required(2, "name", Types.StringType.get()),
Types.NestedField.required(3, "birth", Types.StringType.get())
);
// Partition definition (identity partitioning on the id column)
PartitionSpec spec = PartitionSpec.builderFor(schema)
.identity("id")
.build();
// Optionally, table properties can be supplied as a Map<String, String>
// and passed to catalog.createTable(name, schema, spec, properties).
// Create the namespace (database) if it does not already exist
if (!catalog.namespaceExists(Namespace.of("iceberg_db"))) {
catalog.createNamespace(Namespace.of("iceberg_db"));
}
Table table = null;
// Create the table if it does not exist, otherwise load it
if (!catalog.tableExists(name)) {
table = catalog.createTable(name, schema, spec);
} else {
System.out.println("Table already exists");
table = catalog.loadTable(name);
}
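With the table handle in hand, the core Table accessors can confirm what was created; nothing here is MinIO-specific. Note that currentSnapshot() is null until the first commit.
// Inspect the resolved table metadata
System.out.println("schema:   " + table.schema());
System.out.println("spec:     " + table.spec());
System.out.println("location: " + table.location());
System.out.println("snapshot: " + table.currentSnapshot()); // null before any data is committed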
Next, write data into the Iceberg table from Java:
public void javaCatalogAppend(Schema schema, Table table) throws IOException {
// 1. Build the records to insert
GenericRecord record = GenericRecord.create(schema);
ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();
builder.add(record.copy(ImmutableMap.of("id", 1, "name", "liuyang", "birth", "2020-03-08")));
builder.add(record.copy(ImmutableMap.of("id", 2, "name", "chengx", "birth", "2021-03-09")));
ImmutableList<GenericRecord> records = builder.build();
// 2. Write the records to a Parquet data file
System.out.println("table.location(): " + table.location());
String filepath = table.location() + "/" + UUID.randomUUID().toString();
OutputFile file = table.io().newOutputFile(filepath);
DataWriter<GenericRecord> dataWriter =
Parquet.writeData(file)
.schema(schema)
.createWriterFunc(GenericParquetWriter::buildWriter)
.overwrite()
.withSpec(PartitionSpec.unpartitioned())
.build();
try {
dataWriter.write(records);
} finally {
dataWriter.close();
}
// 3. Append the data file to the table and commit
DataFile dataFile = dataWriter.toDataFile();
table.newAppend().appendFile(dataFile).commit();
}
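Every successful commit, including the append above, adds a snapshot to the table metadata, so the write can be verified by walking the snapshot history. A minimal sketch against the same table handle (org.apache.iceberg.Snapshot needs to be imported):
// One line per snapshot: id, operation (e.g. "append") and the commit summary
for (Snapshot snapshot : table.snapshots()) {
    System.out.println(snapshot.snapshotId() + " " + snapshot.operation() + " " + snapshot.summary());
}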
Query the data from Java with a table scan:
private void javaCatalogScan(Table table) {
IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);
CloseableIterable<Record> records = scanBuilder.build();
for (Record r : records) {
System.out.print(r.get(0));
System.out.print("|");
System.out.print(r.get(1));
System.out.print("|");
System.out.print(r.get(2));
System.out.println();
}
}
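The ScanBuilder also takes a row filter and a column projection, so a scan does not have to read the whole table. A minimal sketch against the same table using org.apache.iceberg.expressions.Expressions; the id = 1 predicate and the name projection are just illustrative, and the enclosing method has to declare or handle the IOException that close() can throw.
// Read only rows with id = 1 and only materialize the name column;
// try-with-resources closes the underlying file streams
try (CloseableIterable<Record> filtered = IcebergGenerics.read(table)
        .where(Expressions.equal("id", 1))
        .select("name")
        .build()) {
    for (Record r : filtered) {
        System.out.println(r.getField("name"));
    }
}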
The complete example:
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Namespace;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.hadoop.HadoopCatalog;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.types.Types;
import java.io.IOException;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.UUID;
public class icebergapi {
public static void main(String[] args)
throws IOException, NoSuchAlgorithmException, InvalidKeyException {
Configuration conf = new Configuration();
conf.set("fs.s3a.aws.credentials.provider"," com.amazonaws.auth.InstanceProfileCredentialsProvider,org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,com.amazonaws.auth.EnvironmentVariableCredentialsProvider");
conf.set("fs.s3a.connection.ssl.enabled", "false");
conf.set("fs.s3a.endpoint", "http://127.0.0.1:9000");
conf.set("fs.s3a.access.key", "minioadmin");
conf.set("fs.s3a.secret.key", "minioadmin");
conf.set("fs.s3a.path.style.access", "true");
conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
conf.set("fs.s3a.fast.upload", "true");
String warehousePath = "s3a://test/"; // MinIO bucket path
System.out.println(warehousePath);
HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);
System.out.println(catalog.name());
TableIdentifier name = TableIdentifier.of("iceberg_db", "table_ice");
// Define the table schema
Schema schema = new Schema(
//Types.NestedField.required(1, "level", Types.StringType.get()),
//Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()),
//Types.NestedField.required(3, "message", Types.StringType.get()),
//Types.NestedField.optional(4, "call_stack", Types.ListType.ofRequired(5, Types.StringType.get()))
Types.NestedField.required(1, "id", Types.IntegerType.get()),
Types.NestedField.required(2, "name", Types.StringType.get()),
Types.NestedField.required(3, "birth", Types.StringType.get())
);
// Partition definition (identity partitioning on the id column)
PartitionSpec spec = PartitionSpec.builderFor(schema)
.identity("id")
.build();
// Optionally, table properties can be supplied as a Map<String, String>
// and passed to catalog.createTable(name, schema, spec, properties).
// Create the namespace (database) if it does not already exist
if (!catalog.namespaceExists(Namespace.of("iceberg_db"))) {
catalog.createNamespace(Namespace.of("iceberg_db"));
}
Table table = null;
// Create the table if it does not exist, otherwise load it
if (!catalog.tableExists(name)) {
table = catalog.createTable(name, schema, spec);
} else {
System.out.println("Table already exists");
table = catalog.loadTable(name);
}
icebergapi api = new icebergapi();
// Write data
api.javaCatalogAppend(schema, table);
// Query data
api.javaCatalogScan(table);
}
public void javaCatalogAppend(Schema schema, Table table) throws IOException {
// 1. Build the records to insert
GenericRecord record = GenericRecord.create(schema);
ImmutableList.Builder<GenericRecord> builder = ImmutableList.builder();
builder.add(record.copy(ImmutableMap.of("id", 1, "name", "liuyang", "birth", "2020-03-08")));
builder.add(record.copy(ImmutableMap.of("id", 2, "name", "chengx", "birth", "2021-03-09")));
ImmutableList<GenericRecord> records = builder.build();
// 2. Write the records to a Parquet data file
System.out.println("table.location(): " + table.location());
String filepath = table.location() + "/" + UUID.randomUUID().toString();
OutputFile file = table.io().newOutputFile(filepath);
DataWriter<GenericRecord> dataWriter =
Parquet.writeData(file)
.schema(schema)
.createWriterFunc(GenericParquetWriter::buildWriter)
.overwrite()
.withSpec(PartitionSpec.unpartitioned())
.build();
try {
dataWriter.write(records);
} finally {
dataWriter.close();
}
// 3. Append the data file to the table and commit
DataFile dataFile = dataWriter.toDataFile();
table.newAppend().appendFile(dataFile).commit();
}
private void javaCatalogScan(Table table) {
IcebergGenerics.ScanBuilder scanBuilder = IcebergGenerics.read(table);
CloseableIterable<Record> records = scanBuilder.build();
for (Record r : records) {
System.out.print(r.get(0));
System.out.print("|");
System.out.print(r.get(1));
System.out.print("|");
System.out.print(r.get(2));
System.out.println();
}
}
}
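The same catalog and table handles also cover basic maintenance. A minimal sketch of two common follow-ups, schema evolution and dropping the table; the email column name is purely illustrative.
// Add an optional column to the live table (metadata-only change, existing files are not rewritten)
table.updateSchema()
    .addColumn("email", Types.StringType.get())
    .commit();
// Remove the table from the catalog and purge its data and metadata files
catalog.dropTable(name, true);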