Versions
flink: 1.13.5
hudi: 0.10.1
hive: 2.1.1
starrocks: 2.5.3
Preparation
Add the required dependencies. The <version> tags are omitted here; see the version-pinning sketch after the list:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-java</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-common</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-java-bridge_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner-blink_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-runtime-blink_2.11</artifactId>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.hudi</groupId>
    <artifactId>hudi-flink-bundle_2.11</artifactId>
    <scope>provided</scope>
</dependency>
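A minimal sketch of pinning the omitted versions, assuming they are managed via properties and dependencyManagement (this arrangement is an assumption, not taken from any original project pom):

<properties>
    <flink.version>1.13.5</flink.version>
    <hudi.version>0.10.1</hudi.version>
</properties>
<dependencyManagement>
    <dependencies>
        <!-- one managed entry per Flink artifact listed above -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hudi</groupId>
            <artifactId>hudi-flink-bundle_2.11</artifactId>
            <version>${hudi.version}</version>
        </dependency>
    </dependencies>
</dependencyManagement>

With this in place, the provided-scope dependencies compile against the same versions that will run on the cluster.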
Practice
The official hudi-flink-bundle_2.11 artifact on Maven Central does not bundle the Hive-related dependencies,
so you need to clone the Apache Hudi repository (https://github.com/apache/hudi) and build the bundle yourself.
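A minimal sketch of fetching the source (the tag name release-0.10.1 is an assumption; check the repository's tags for the exact name):

git clone https://github.com/apache/hudi.git
cd hudi
git checkout release-0.10.1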
Build and package Hudi
cd packaging/hudi-flink-bundle
Edit the pom.xml file: locate the flink-bundle-shade-hive2 profile (this walkthrough uses Hive 2.1.1; pick the profile that matches your Hive version) and change hive.version:
<profile>
    <id>flink-bundle-shade-hive2</id>
    <properties>
        <hive.version>2.1.1</hive.version>
        <flink.bundle.hive.scope>compile</flink.bundle.hive.scope>
    </properties>
    <dependencies>
        <dependency>
            <groupId>${hive.groupid}</groupId>
            <artifactId>hive-service-rpc</artifactId>
            <version>${hive.version}</version>
            <scope>${flink.bundle.hive.scope}</scope>
        </dependency>
    </dependencies>
</profile>
Make sure every Hive dependency in the dependencies section actually gets bundled. In the hudi-0.10.1 source, the hive-exec dependency is missing the scope, so add it:
<dependency>
    <groupId>${hive.groupid}</groupId>
    <artifactId>hive-exec</artifactId>
    <version>${hive.version}</version>
    <scope>${flink.bundle.hive.scope}</scope>
</dependency>
Build and package with Maven by running the following in the packaging/hudi-flink-bundle directory:
mvn install -Dmaven.test.skip=true -Drat.skip=true -Pflink-bundle-shade-hive2
When the build completes, the hudi-flink-bundle_2.11 jar is installed into the local Maven repository; it is roughly 60+ MB.
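To make the bundle visible to Flink SQL jobs, one common approach is to copy it into Flink's lib directory; a sketch, where the exact jar filename and FLINK_HOME are assumptions about your build output and installation layout:

cp packaging/hudi-flink-bundle/target/hudi-flink-bundle_2.11-0.10.1.jar $FLINK_HOME/lib/

Restart the cluster (or the SQL client session) afterwards so the new jar is picked up.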
Flink job SQL
-- use Flink's datagen connector to generate test data
-- create the generator table
CREATE TABLE datagen (
    f_sequence INT,
    f_random INT,
    f_random_str STRING,
    ts AS localtimestamp,
    WATERMARK FOR ts AS ts
) WITH (
    'connector' = 'datagen',
    -- optional options --
    'rows-per-second' = '1',
    'fields.f_sequence.kind' = 'sequence',
    'fields.f_sequence.start' = '1',
    'fields.f_sequence.end' = '1000',
    'fields.f_random.min' = '1',
    'fields.f_random.max' = '1000',
    'fields.f_random_str.length' = '10'
);
-- create the Hudi table
CREATE TABLE hudi_datagen (
    f_sequence INT,
    f_random INT,
    f_random_str STRING,
    ts TIMESTAMP(0)
) WITH (
    'connector' = 'hudi',
    'table.type' = 'COPY_ON_WRITE',
    'path' = '${path}',
    'hoodie.table.name' = 'hudi_datagen',
    'write.precombine.field' = 'ts',
    'hoodie.datasource.write.recordkey.field' = 'f_sequence',
    'hive_sync.enable' = 'true',
    'hive_sync.username' = '${username}',
    'hive_sync.db' = 'default',
    'hive_sync.table' = 'hudi_datagen',
    'hive_sync.mode' = 'hms',
    'hive_sync.metastore.uris' = 'thrift://${ip}:${port}'
);
INSERT INTO hudi_datagen
SELECT * FROM datagen;
The job writes the generated data to the configured path; once a Flink checkpoint completes, the data is committed to the Hudi table and can be queried with Hive.
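Since Hudi commits are tied to checkpoints, checkpointing must be enabled; a minimal sketch for the Flink SQL client (the 30s interval is an illustrative choice, not a recommendation):

SET execution.checkpointing.interval = 30s;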
-- show tables; hudi_datagen should appear in the list
show tables from default;
-- query the table
select * from default.hudi_datagen limit 10;
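These statements can be run from Beeline, for instance; a sketch assuming HiveServer2 listens on the default port 10000 (both the host and port here are assumptions about your environment):

beeline -u "jdbc:hive2://${ip}:10000" -n ${username}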
Query Hudi from StarRocks
Create a Hudi catalog
-- create the Hudi catalog
create external catalog hudi
properties
(
    "type" = "hudi",
    "hive.metastore.type" = "hive",
    "hive.metastore.uris" = "thrift://${ip}:${port}"
);
-- query the data
select * from hudi.default.hudi_datagen limit 10;
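The catalog can also be browsed before querying; a short sketch (these SHOW forms should be available in StarRocks 2.5, but verify against your version's documentation):

-- list databases and tables exposed through the catalog
show databases from hudi;
show tables from hudi.default;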