Maven
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.11.8</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<flink.version>1.11.2</flink.version>
<hoodie.version>0.8.0</hoodie.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink-client</artifactId>
<version>${hoodie.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink-bundle_${scala.binary.version}</artifactId>
<version>${hoodie.version}</version>
</dependency>
<!-- other -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>1.10.0</version>
</dependency>
<!-- flink -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-csv</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
数据准备
------ csv data ------------------
sendor_4,1714583000,55.00386985053892,SZ
sendor_3,1714583000,20.731258179725796,SZ
sendor_9,1714583000,69.41300298965302,SZ
sendor_8,1714583000,47.82942764820756,SZ
sendor_6,1714583001,71.90507480073437,SZ
sendor_6,1714583002,72.100,BJ
sendor_6,1714589003,73.000,BJ
sendor_6,1714589004,74.000,BJ
sendor_6,1714589005,84.000,BJ
sendor_6,1714589006,95.000,SH
sendor_6,1714589007,77.000,SH
sendor_1,1714589008,78.000,SH
------------------------ kafka data -----------------
{"id":"sensor_1","ts":"16145830009","temp":78.71,"location":"HZ"}
{"id":"sensor_2","ts":"16145830009","temp":78.72,"location":"HZ"}
{"id":"sensor_3","ts":"16145830009","temp":78.73,"location":"SH"}
{"id":"sensor_4","ts":"16145830009","temp":78.74,"location":"SH"}
{"id":"sensor_5","ts":"16145830009","temp":78.75,"location":"HZ"}
{"id":"sensor_1","ts":"16145830009","temp":78.76,"location":"HZ"}
{"id":"sensor_2","ts":"16145830009","temp":78.77,"location":"HZ"}
{"id":"sensor_3","ts":"16145830009","temp":78.78,"location":"SH"}
写MOR
读取csv文件作为source,插入hudi mor表中
--create table source(
-- id VARCHAR(20),
-- ts VARCHAR(20),
-- temp DOUBLE,
-- location VARCHAR(20)
--) with (
-- 'connector.type' = 'filesystem',
-- 'format.type' = 'csv',
-- 'connector.path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor2.txt'
--);
-- 使用kafka source
CREATE TABLE source (
id STRING,
ts STRING,
temp DOUBLE,
location STRING
) with (
'connector.type' = 'kafka',
'connector.version' = 'universal',
'connector.topic' = 'hudi_on_flink2',
'connector.properties.group.id' = 'hudi_on_flink2',
'connector.startup-mode' = 'earliest-offset',
'connector.properties.zookeeper.connect' = 'localhost:2181',
'connector.properties.bootstrap.servers' = 'localhost:9092',
'format.type' = 'json'
);
CREATE TABLE sensor(
id VARCHAR(20) PRIMARY KEY NOT ENFORCED, -- 指定主键
ts VARCHAR(10),
temp DOUBLE,
location VARCHAR(20)
)
PARTITIONED BY (location)
WITH (
'connector' = 'hudi',
'path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor1',
'compaction.delta_commits' = '1',
'compaction.async.enabled' = 'true',
'compaction.trigger.strategy' = 'num_commits',
'compaction.delta_seconds' = '10',
'table.type' = 'MERGE_ON_READ'
);
package com.hjl.table;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
/**
 * Streams sensor records from a filesystem CSV source table into a Hudi
 * MERGE_ON_READ table, entirely via Flink SQL.
 *
 * @Author jiale.he
 * @Date 2021-04-22 18:37 (Thursday)
 */
public class TableDemo5HudiWrite {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Blink planner in streaming mode, bridged to the DataStream environment.
        final EnvironmentSettings settings = EnvironmentSettings.newInstance()
                .useBlinkPlanner()
                .inStreamingMode()
                .build();
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);

        // Source: plain CSV file on the local filesystem.
        final String sourceDdl = String.join("\n",
                "create table source(",
                " id VARCHAR(20),",
                " ts VARCHAR(20),",
                " temp DOUBLE,",
                " location VARCHAR(20)",
                ") with (",
                " 'connector.type' = 'filesystem',",
                " 'format.type' = 'csv',",
                " 'connector.path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor2.txt'",
                ")");

        // Sink: Hudi MOR table, partitioned by location, compacting every commit.
        final String sinkDdl = String.join("\n",
                "CREATE TABLE sensor(",
                " id VARCHAR(20) PRIMARY KEY NOT ENFORCED,",
                " ts VARCHAR(10),",
                " temp DOUBLE,",
                " location VARCHAR(20)",
                ")",
                "PARTITIONED BY (location)",
                "WITH (",
                " 'connector' = 'hudi',",
                " 'path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor1',",
                " 'compaction.delta_commits' = '1',",
                " 'compaction.async.enabled' = 'true',",
                " 'compaction.trigger.strategy' = 'num_commits',",
                " 'table.type' = 'MERGE_ON_READ'",
                ")");

        // Register both tables, then start the continuous insert job.
        tEnv.executeSql(sourceDdl);
        tEnv.executeSql(sinkDdl);
        tEnv.executeSql("insert into sensor select id,ts,temp,location from source");
    }
}
问题
我把
compaction.delta_commits
compaction.async.enabled
compaction.trigger.strategy
三个参数反复调试之后,还是没能触发compaction,写下来的都是log,没有合并成parquet
解决方法
没有触发compaction的原因是没有设置flink checkpoint
请看以下示例
package com.hjl.table;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/**
 * Streams JSON sensor events from a Kafka topic into a Hudi MERGE_ON_READ
 * table. Checkpointing is enabled here: Hudi's Flink writer commits on
 * checkpoint, so compaction only triggers once checkpoints are configured.
 *
 * @Author jiale.he
 * @Date 2021-05-06 18:37
 */
public class TableDemo5HudiWrite2 {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Checkpoint configuration — required for Hudi commits/compaction.
        env.enableCheckpointing(5000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.setStateBackend(new FsStateBackend("file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/cp"));

        // Blink planner in streaming mode, bridged to the DataStream environment.
        final EnvironmentSettings settings = EnvironmentSettings.newInstance()
                .useBlinkPlanner()
                .inStreamingMode()
                .build();
        final StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);

        // Source: Kafka topic with JSON payloads, read from the earliest offset.
        final String sourceDdl = String.join("\n",
                "CREATE TABLE source (",
                " id STRING,",
                " ts STRING,",
                " temp DOUBLE,",
                " location STRING",
                ") with (",
                " 'connector.type' = 'kafka',",
                " 'connector.version' = 'universal',",
                " 'connector.topic' = 'hudi_on_flink2',",
                " 'connector.properties.group.id' = 'hudi_on_flink2',",
                " 'connector.startup-mode' = 'earliest-offset',",
                " 'connector.properties.zookeeper.connect' = 'localhost:2181',",
                " 'connector.properties.bootstrap.servers' = 'localhost:9092',",
                " 'format.type' = 'json'",
                ")");

        // Sink: Hudi MOR table; async compaction every 2 delta commits.
        final String sinkDdl = String.join("\n",
                "CREATE TABLE sensor(",
                " id VARCHAR(20) PRIMARY KEY NOT ENFORCED,",
                " ts VARCHAR(10),",
                " temp DOUBLE,",
                " location VARCHAR(20)",
                ")",
                "PARTITIONED BY (location)",
                "WITH (",
                " 'connector' = 'hudi',",
                " 'path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor1',",
                " 'compaction.delta_commits' = '2',",
                " 'compaction.async.enabled' = 'true',",
                " 'compaction.trigger.strategy' = 'num_commits',",
                " 'compaction.delta_seconds' = '5',",
                " 'table.type' = 'MERGE_ON_READ'",
                ")");

        // Register both tables, then start the continuous insert job.
        tEnv.executeSql(sourceDdl);
        tEnv.executeSql(sinkDdl);
        tEnv.executeSql("insert into sensor select id,ts,temp,location from source");

        // Debugging aid: print the source stream instead of writing to Hudi.
        // Table table = tEnv.sqlQuery("select * from source");
        // tEnv.toAppendStream(table, Row.class).print();
        // env.execute("flink table api hudi test1");
    }
}
成功触发压缩操作,压缩log日志生成parquet文件
写COW
与MOR类似,将表类型换成 COPY_ON_WRITE 即可