1. Environment Preparation
Make sure the CDH environment and the Flink cluster are ready:
CDH 6.3.2
Flink 1.14.4
Scala 2.11
Hive 2.1.1
2. Required Packages
The Iceberg runtime for Flink:
https://iceberg.apache.org/multi-engine-support/#apache-flink
The Flink SQL connector for Hive:
https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-2.2.0_2.11/1.14.4/
The Iceberg runtime for Hive:
https://iceberg.apache.org/multi-engine-support/#apache-hive
3. Create a Maven Project Named flink-iceberg
The pom file is as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>flink_iceberg</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <flink.version>1.14.4</flink.version>
        <scala.binary.version>2.11</scala.binary.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope> -->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope> -->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope> -->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>com.ververica</groupId>
            <artifactId>flink-connector-mysql-cdc</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.kafka</groupId>
                    <artifactId>kafka-clients</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.68</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <appendAssemblyId>false</appendAssemblyId>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                    <compilerArgs>
                        <arg>-verbose</arg>
                        <arg>-Xlint:unchecked</arg>
                        <arg>-Xlint:deprecation</arg>
                        <arg>-bootclasspath</arg>
                        <arg>D:\common_jar\flink-sql-connector-hive-2.2.0_2.11-1.14.4.jar;D:\common_jar\iceberg-flink-runtime-1.14-0.13.2.jar</arg>
                        <arg>-extdirs</arg>
                        <arg>${project.basedir}/src/lib</arg>
                    </compilerArgs>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
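Note that the -bootclasspath / -extdirs compiler arguments above pin the build to local, absolute jar paths. A more portable alternative is to declare the two connector jars as provided Maven dependencies; the sketch below assumes the jar versions listed in section 2:

<dependency>
    <groupId>org.apache.iceberg</groupId>
    <artifactId>iceberg-flink-runtime-1.14</artifactId>
    <version>0.13.2</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-sql-connector-hive-2.2.0_2.11</artifactId>
    <version>1.14.4</version>
    <scope>provided</scope>
</dependency>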
Copy the HDFS and Hive configuration files into the resources directory; they can be downloaded from the CDH cluster.
Add the Iceberg-Flink runtime jar and the Flink-Hive connector jar to the project as third-party jars.
4. Writing to Iceberg from a Flink SQL Program
4.1 Creating a table via HadoopCatalog
Create the package work.jiang.iceberg.flinksql.hadoopcatalog
and under it create the class FlinkSQLWriteHadoopCatalogIcebergTable.
The code is as follows:
package work.jiang.iceberg.flinksql.hadoopcatalog;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class FlinkSQLWriteHadoopCatalogIcebergTable {
    public static void main(String[] args) throws Exception {
        // Create the Flink execution environment
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings bsSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        env.setParallelism(1);
        env.enableCheckpointing(10000);
        StreamTableEnvironment tbenv = StreamTableEnvironment.create(env, bsSettings);

        // Create the Iceberg table through a hadoop_catalog
        tbenv.executeSql("CREATE CATALOG hadoop_catalog WITH ( \n" +
                " 'type'='iceberg',\n" +
                " 'catalog-type'='hadoop', \n" +
                " 'warehouse'='hdfs://cdh01:8020/user/iceberg/hadoop_catalog', \n" +
                " 'property-version'='1'\n" +
                ")");

        // Create the database inside hadoop_catalog (not in the default catalog)
        tbenv.executeSql("CREATE DATABASE if not exists hadoop_catalog.iceberg_db");

        tbenv.executeSql("CREATE TABLE if not exists hadoop_catalog.iceberg_db.t_iceberg_sample_1 (\n" +
                " id BIGINT COMMENT 'unique id',\n" +
                " data STRING\n" +
                ") WITH (\n" +
                " 'type'='iceberg',\n" +
                " 'catalog-type'='hadoop',\n" +
                " 'warehouse'='hdfs://cdh01:8020/user/iceberg/hadoop_catalog/iceberg_db/t_iceberg_sample_1',\n" +
                " 'property-version'='1'\n" +
                ")");

        tbenv.executeSql("insert into hadoop_catalog.iceberg_db.t_iceberg_sample_1(id, data) values(10, '2021-04-29 17:38:00')");

        // print() on the returned TableResult is required to see rows on the console
        tbenv.executeSql("SELECT * FROM hadoop_catalog.iceberg_db.t_iceberg_sample_1").print();
    }
}
Run the code, then look at the directory on HDFS: the table directory has been created successfully.
The table directory contains data (data files), metadata (metadata files), and temp (temporary files). Under data are the partition directories (if the table was created with partitions), and inside each partition are the actual Parquet files; Iceberg stores data in Parquet format by default, though Avro or ORC can be specified instead. Under metadata are the manifest files, the snapshot files, the TableMetadata files, and the current table version pointer, which records which version of the table metadata is currently in use.
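To verify this layout programmatically, the table root can be listed with the Hadoop FileSystem API. A minimal sketch (the class name is illustrative; the path is the warehouse location used above):

package work.jiang.iceberg.flinksql.hadoopcatalog;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;

public class ListIcebergTableDir {
    public static void main(String[] args) throws Exception {
        // Connect to HDFS as the hdfs user, matching the examples above
        FileSystem fs = FileSystem.get(URI.create("hdfs://cdh01:8020"), new Configuration(), "hdfs");
        // Table created in section 4.1; expect data/ and metadata/ under it
        Path tableRoot = new Path("/user/iceberg/hadoop_catalog/iceberg_db/t_iceberg_sample_1");
        for (FileStatus status : fs.listStatus(tableRoot)) {
            System.out.println(status.getPath());
        }
    }
}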
4.2 Batch-Reading a Hadoop-Catalog Iceberg Table with Flink SQL
Create the class work.jiang.iceberg.flinksql.hadoopcatalog.FlinkSQLReadHadoopCatalogIcebergTable.
The code is as follows:
package work.jiang.iceberg.flinksql.hadoopcatalog;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class FlinkSQLReadHadoopCatalogIcebergTable {
    public static void main(String[] args) throws Exception {
        // Create the Flink execution environment
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings bsSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        env.setParallelism(1);
        env.enableCheckpointing(10000);
        StreamTableEnvironment tbenv = StreamTableEnvironment.create(env, bsSettings);

        // The catalog must be created before querying; catalog definitions are not persisted across sessions
        tbenv.executeSql("CREATE CATALOG hadoop_catalog WITH ( \n" +
                " 'type'='iceberg',\n" +
                " 'catalog-type'='hadoop', \n" +
                " 'warehouse'='hdfs://cdh01:8020/user/iceberg/hadoop_catalog', \n" +
                " 'property-version'='1'\n" +
                ")");
        tbenv.executeSql("CREATE DATABASE if not exists hadoop_catalog.iceberg_db");

        // Not supported at the moment: tbenv.executeSql("set execution.runtime-mode = streaming");
        // Note: executeSql() alone prints nothing to the console; call print() on the returned
        // TableResult. Alternatively, read Iceberg with the DataStream API (see the sketch below).
        tbenv.executeSql("SELECT * FROM hadoop_catalog.iceberg_db.t_iceberg_sample_1").print();
    }
}
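As the comment above suggests, the same table can also be read with the DataStream API. A minimal batch-read sketch using Iceberg's TableLoader and FlinkSource (the class name is illustrative; the path is the table location from section 4.1):

package work.jiang.iceberg.flinksql.hadoopcatalog;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

public class FlinkStreamReadHadoopCatalogIcebergTable {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Load the Hadoop-catalog table directly from its HDFS location
        TableLoader tableLoader = TableLoader.fromHadoopTable(
                "hdfs://cdh01:8020/user/iceberg/hadoop_catalog/iceberg_db/t_iceberg_sample_1");

        // streaming(false) reads the current snapshot as a bounded source;
        // streaming(true) would keep monitoring the table for new snapshots
        DataStream<RowData> batch = FlinkSource.forRowData()
                .env(env)
                .tableLoader(tableLoader)
                .streaming(false)
                .build();

        batch.print();
        env.execute("Batch read iceberg table");
    }
}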
4.3 Creating a Hive-Catalog Iceberg Table from a Flink SQL Program
Create the class work.jiang.iceberg.flinksql.hivecatalog.FlinkSQLWriteHiveCatalogIcebergTable.
The code is as follows:
package work.jiang.iceberg.flinksql.hivecatalog;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class FlinkSQLWriteHiveCatalogIcebergTable {
    public static void main(String[] args) throws Exception {
        // Create the Flink execution environment
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings bsSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        env.setParallelism(1);
        env.enableCheckpointing(10000);
        StreamTableEnvironment tbenv = StreamTableEnvironment.create(env, bsSettings);

        // Create the table through a hive_catalog backed by the Hive Metastore
        tbenv.executeSql("CREATE CATALOG hive_catalog WITH (\n" +
                " 'type'='iceberg',\n" +
                " 'catalog-type'='hive',\n" +
                " 'uri'='thrift://cdh01:9083',\n" +
                " 'clients'='5',\n" +
                " 'property-version'='1',\n" +
                " 'warehouse'='hdfs://cdh01:8020/user/iceberg/hive_catalog'\n" +
                ")");

        tbenv.executeSql("CREATE DATABASE if not exists hive_catalog.iceberg_db_hive");

        // A plain table like the following cannot be queried from Hive,
        // so creating the table this way is not recommended:
        //tbenv.executeSql("CREATE TABLE if not exists `hive_catalog`.`iceberg_db_hive`.`sample` (\n" +
        //        " id BIGINT COMMENT 'unique id',\n" +
        //        " data STRING\n" +
        //        ")");

        // Putting the Hive settings into the table properties at creation time
        // makes the data automatically visible to (and queryable from) Hive
        tbenv.executeSql("CREATE TABLE if not exists `hive_catalog`.`iceberg_db_hive`.`sample` (\n" +
                " id BIGINT COMMENT 'unique id',\n" +
                " data STRING\n" +
                ") WITH (\n" +
                " 'format-version'='1',\n" +
                " 'iceberg.mr.catalog'='hive',\n" +
                " 'engine.hive.enabled'='true',\n" +
                " 'catalog-name'='hive_catalog',\n" +
                " 'catalog-database'='iceberg_db_hive',\n" +
                " 'type'='iceberg',\n" +
                " 'uri'='thrift://cdh01:9083',\n" +
                " 'catalog-type'='hive',\n" +
                " 'warehouse'='hdfs://cdh01:8020/user/iceberg/hive_catalog/iceberg_db/sample'\n" +
                ")");

        tbenv.executeSql("insert into hive_catalog.iceberg_db_hive.sample(id,data) values (2,'bb')");

        tbenv.executeSql("select * from hive_catalog.iceberg_db_hive.sample").print();
    }
}
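With 'engine.hive.enabled'='true' set as above, the table should also be queryable from the Hive side once the Iceberg-Hive runtime jar (see section 2) is on Hive's classpath. A sketch from the Hive CLI; the jar path is an assumption for illustration:

-- Make the Iceberg Hive runtime available in this session (path is an example)
ADD JAR /opt/jars/iceberg-hive-runtime-0.13.2.jar;
SELECT * FROM iceberg_db_hive.sample;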
4.4 Reading a Hive-Catalog Iceberg Table with Flink SQL
The code is as follows:
package work.jiang.iceberg.flinksql.hivecatalog;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class FlinkSQLReadHiveCatalogIcebergTable {
    public static void main(String[] args) throws Exception {
        // Create the Flink execution environment
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings bsSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        env.setParallelism(1);
        env.enableCheckpointing(10000);
        StreamTableEnvironment tbenv = StreamTableEnvironment.create(env, bsSettings);

        tbenv.executeSql("CREATE CATALOG hive_catalog WITH (\n" +
                " 'type'='iceberg',\n" +
                " 'catalog-type'='hive',\n" +
                " 'uri'='thrift://cdh01:9083',\n" +
                " 'clients'='5',\n" +
                " 'property-version'='1',\n" +
                " 'warehouse'='hdfs://cdh01:8020/user/iceberg/hive_catalog'\n" +
                ")");
        tbenv.executeSql("CREATE DATABASE if not exists hive_catalog.iceberg_db_hive");

        // print() is required to see the query result on the console
        tbenv.executeSql("select * from hive_catalog.iceberg_db_hive.sample").print();
    }
}
5. Operating Iceberg from the Flink SQL Command Line
Start the SQL client with the two jars loaded. Note that every new session (the second and each subsequent one included) must re-create the catalog before querying:
[root@flink01 flink-1.14.4]# bin/sql-client.sh embedded -j lib/iceberg-flink-runtime-1.14-0.13.2.jar -j lib/flink-sql-connector-hive-2.2.0_2.11-1.14.4.jar shell
-- 1. Create the hadoop_catalog
Flink SQL> CREATE CATALOG hadoop_catalog WITH (
'type'='iceberg',
'catalog-type'='hadoop',
'warehouse'='hdfs://cdh01:8020/user/iceberg/hadoop_catalog',
'property-version'='1'
);
Flink SQL> show catalogs;
-- 2. Create the database
Flink SQL> CREATE DATABASE if not exists hadoop_catalog.iceberg_db;
-- 3. Create a non-partitioned table and a partitioned table
Flink SQL> DROP TABLE hadoop_catalog.iceberg_db.t_iceberg_sample_1;
Flink SQL> CREATE TABLE hadoop_catalog.iceberg_db.t_iceberg_sample_1 (
id BIGINT COMMENT 'unique id',
data STRING
)WITH (
'type'='iceberg',
'catalog-type'='hadoop',
'warehouse'='hdfs://cdh01:8020/user/iceberg/hadoop_catalog/iceberg_db/t_iceberg_sample_1',
'property-version'='1'
);
Flink SQL> insert into hadoop_catalog.iceberg_db.t_iceberg_sample_1(id, data) values(10, 'ff');
Flink SQL> SET sql-client.execution.result-mode=tableau;
Flink SQL> select * from hadoop_catalog.iceberg_db.t_iceberg_sample_1;
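Besides the one-shot read above, the table can also be read in streaming mode from the SQL client via dynamic table options (supported by the Iceberg 0.13 Flink runtime; the monitor interval below is an example value):

Flink SQL> SET table.dynamic-table-options.enabled=true;
Flink SQL> SELECT * FROM hadoop_catalog.iceberg_db.t_iceberg_sample_1 /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s') */;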
-- Partitioned table
Flink SQL> DROP TABLE hadoop_catalog.iceberg_db.sample_iceberg_partition;
Flink SQL> CREATE TABLE hadoop_catalog.iceberg_db.sample_iceberg_partition (
id BIGINT COMMENT 'unique id',
data STRING
) PARTITIONED BY (data)
WITH (
'type'='iceberg',
'catalog-type'='hadoop',
'warehouse'='hdfs://cdh01:8020/user/iceberg/hadoop_catalog/iceberg_db/sample_iceberg_partition',
'property-version'='1'
);
Flink SQL> INSERT into hadoop_catalog.iceberg_db.sample_iceberg_partition PARTITION(data='aa') SELECT id FROM hadoop_catalog.iceberg_db.t_iceberg_sample_1 WHERE data = 'aa';
Flink SQL> select * from hadoop_catalog.iceberg_db.sample_iceberg_partition;
-- 4. Inspect the storage layout
[root@cdh01 flink]# hadoop fs -ls -R hdfs://cdh01:8020/user/iceberg/hadoop_catalog/iceberg_db/t_iceberg_sample_1/metadata