Hudi Flink SQL

Maven

<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <encoding>UTF-8</encoding>
    
    <scala.version>2.11.8</scala.version>
    <scala.binary.version>2.11</scala.binary.version>
    <flink.version>1.11.2</flink.version>
    <hoodie.version>0.8.0</hoodie.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.hudi</groupId>
        <artifactId>hudi-flink-client</artifactId>
        <version>${hoodie.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.hudi</groupId>
        <artifactId>hudi-flink-bundle_${scala.binary.version}</artifactId>
        <version>${hoodie.version}</version>
    </dependency>

    <!-- other -->
    <dependency>
        <groupId>org.apache.avro</groupId>
        <artifactId>avro</artifactId>
        <version>1.10.0</version>
    </dependency>
    
    <!-- flink -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-csv</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-scala_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
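
To submit the job to a cluster rather than run it from the IDE, the project is usually packaged as a fat jar so the Hudi bundle and connector classes ship with it. A minimal maven-shade-plugin sketch (the plugin version here is illustrative):

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.2.4</version>
            <executions>
                <!-- bind the shade goal to package so `mvn package` produces the fat jar -->
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>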

Data Preparation

------ CSV data ------------------
sensor_4,1714583000,55.00386985053892,SZ
sensor_3,1714583000,20.731258179725796,SZ
sensor_9,1714583000,69.41300298965302,SZ
sensor_8,1714583000,47.82942764820756,SZ
sensor_6,1714583001,71.90507480073437,SZ
sensor_6,1714583002,72.100,BJ
sensor_6,1714589003,73.000,BJ
sensor_6,1714589004,74.000,BJ
sensor_6,1714589005,84.000,BJ
sensor_6,1714589006,95.000,SH
sensor_6,1714589007,77.000,SH
sensor_1,1714589008,78.000,SH

------------------------ Kafka data -----------------
{"id":"sensor_1","ts":"16145830009","temp":78.71,"location":"HZ"}
{"id":"sensor_2","ts":"16145830009","temp":78.72,"location":"HZ"}
{"id":"sensor_3","ts":"16145830009","temp":78.73,"location":"SH"}
{"id":"sensor_4","ts":"16145830009","temp":78.74,"location":"SH"}
{"id":"sensor_5","ts":"16145830009","temp":78.75,"location":"HZ"}
{"id":"sensor_1","ts":"16145830009","temp":78.76,"location":"HZ"}
{"id":"sensor_2","ts":"16145830009","temp":78.77,"location":"HZ"}
{"id":"sensor_3","ts":"16145830009","temp":78.78,"location":"SH"}

Writing MOR

Read a CSV file as the source and insert the rows into a Hudi MOR table.

--create table source(
--    id VARCHAR(20),
--    ts VARCHAR(20),
--    temp DOUBLE,
--    location VARCHAR(20)
--) with (
--    'connector.type' = 'filesystem',
--    'format.type' = 'csv',
--    'connector.path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor2.txt'
--);

-- use the Kafka source instead
CREATE TABLE source (
  id STRING,
  ts STRING,
  temp DOUBLE,
  location STRING
) with (
    'connector.type' = 'kafka',
    'connector.version' = 'universal',
    'connector.topic' = 'hudi_on_flink2',
    'connector.properties.group.id' = 'hudi_on_flink2',
    'connector.startup-mode' = 'earliest-offset',
    'connector.properties.zookeeper.connect' = 'localhost:2181',
    'connector.properties.bootstrap.servers' = 'localhost:9092',
    'format.type' = 'json'
);
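
The DDL above uses the legacy connector.* option style. Flink 1.11 also accepts the newer option keys for the Kafka connector; an equivalent sketch using the same topic and brokers:

CREATE TABLE source (
  id STRING,
  ts STRING,
  temp DOUBLE,
  location STRING
) WITH (
  'connector' = 'kafka',
  'topic' = 'hudi_on_flink2',
  'properties.bootstrap.servers' = 'localhost:9092',
  'properties.group.id' = 'hudi_on_flink2',
  'scan.startup.mode' = 'earliest-offset',
  'format' = 'json'
);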

CREATE TABLE sensor(
  id VARCHAR(20) PRIMARY KEY NOT ENFORCED, -- specify the primary key (record key)
  ts VARCHAR(10),
  temp DOUBLE,
  location VARCHAR(20)
)
PARTITIONED BY (location)
WITH (
  'connector' = 'hudi',
  'path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor1',
  'compaction.delta_commits' = '1',
  'compaction.async.enabled' = 'true',
  'compaction.trigger.strategy' = 'num_commits',
  'compaction.delta_seconds' = '10',
  'table.type' = 'MERGE_ON_READ'
);
The same job, expressed with the Java Table API:

package com.hjl.table;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * @Description
 * @Author jiale.he
 * @Date 2021-04-22 18:37 Thu
 */
public class TableDemo5HudiWrite {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, EnvironmentSettings.newInstance()
                .useBlinkPlanner()
                .inStreamingMode()
                .build());

        String sinkDDL = "CREATE TABLE sensor(\n" +
                "  id VARCHAR(20) PRIMARY KEY NOT ENFORCED,\n" +
                "  ts VARCHAR(10),\n" +
                "  temp DOUBLE,\n" +
                "  location VARCHAR(20)\n" +
                ")\n" +
                "PARTITIONED BY (location)\n" +
                "WITH (\n" +
                "  'connector' = 'hudi',\n" +
                "  'path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor1',\n" +
                "  'compaction.delta_commits' = '1',\n" +
                "  'compaction.async.enabled' = 'true',\n" +
                "  'compaction.trigger.strategy' = 'num_commits',\n" +
                "  'table.type' = 'MERGE_ON_READ'\n" +
                ")";

        String sourceDDL = "create table source(\n" +
                "    id VARCHAR(20),\n" +
                "    ts VARCHAR(20),\n" +
                "    temp DOUBLE,\n" +
                "    location VARCHAR(20)\n" +
                ") with (\n" +
                "    'connector.type' = 'filesystem',\n" +
                "    'format.type' = 'csv',\n" +
                "    'connector.path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor2.txt'\n" +
                ")";

        String query = "insert into sensor select id,ts,temp,location from source";
        
        // register source and sink tables, then submit the insert job
        tableEnv.executeSql(sourceDDL);
        tableEnv.executeSql(sinkDDL);
        tableEnv.executeSql(query);
    }
}


Problem

After repeatedly tuning the three parameters
compaction.delta_commits
compaction.async.enabled
compaction.trigger.strategy
compaction still never triggered: only log files were written, and they were never compacted into Parquet.

Solution

The root cause is that Flink checkpointing was not enabled: the Hudi Flink writer only commits on checkpoint completion, so without checkpoints no delta commit is ever finalized and the num_commits compaction trigger never fires.
See the following example.

package com.hjl.table;

import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

/**
 * @Description
 * @Author jiale.he
 * @Date 2021-05-06 18:37
 */
public class TableDemo5HudiWrite2 {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // checkpointing is required: the Hudi Flink writer commits on checkpoint completion
        env.setParallelism(1);
        env.enableCheckpointing(5000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.setStateBackend(new FsStateBackend("file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/cp"));
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, EnvironmentSettings.newInstance()
                .useBlinkPlanner()
                .inStreamingMode()
                .build());

        String sinkDDL = "CREATE TABLE sensor(\n" +
                "  id VARCHAR(20) PRIMARY KEY NOT ENFORCED,\n" +
                "  ts VARCHAR(10),\n" +
                "  temp DOUBLE,\n" +
                "  location VARCHAR(20)\n" +
                ")\n" +
                "PARTITIONED BY (location)\n" +
                "WITH (\n" +
                "  'connector' = 'hudi',\n" +
                "  'path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor1',\n" +
                "  'compaction.delta_commits' = '2',\n" +
                "  'compaction.async.enabled' = 'true',\n" +
                "  'compaction.trigger.strategy' = 'num_commits',\n" +
                "  'compaction.delta_seconds' = '5',\n" +
                "  'table.type' = 'MERGE_ON_READ'\n" +
                ")";

        String sourceDDL = "CREATE TABLE source (\n" +
                "  id STRING,\n" +
                "  ts STRING,\n" +
                "  temp DOUBLE,\n" +
                "  location STRING\n" +
                ") with (\n" +
                "    'connector.type' = 'kafka',\n" +
                "    'connector.version' = 'universal',\n" +
                "    'connector.topic' = 'hudi_on_flink2',\n" +
                "    'connector.properties.group.id' = 'hudi_on_flink2',\n" +
                "    'connector.startup-mode' = 'earliest-offset',\n" +
                "    'connector.properties.zookeeper.connect' = 'localhost:2181',\n" +
                "    'connector.properties.bootstrap.servers' = 'localhost:9092',\n" +
                "    'format.type' = 'json'\n" +
                ")";


        String query = "insert into sensor select id,ts,temp,location from source";


        // register source and sink tables, then submit the insert job
        tableEnv.executeSql(sourceDDL);
        tableEnv.executeSql(sinkDDL);
        tableEnv.executeSql(query);
//        Table table = tableEnv.sqlQuery("select * from source");
//        tableEnv.toAppendStream(table, Row.class).print();
//        env.execute("flink table api hudi test1");

    }
}

Compaction now triggers successfully, and the log files are compacted into Parquet files.
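
Once compacted, the table can be read back through the same connector. A minimal sketch, assuming the sensor table is registered with the same Hudi DDL as the sink and that the 0.8.0 Flink reader is available (the streaming-read keys in the comments are an assumption about 0.8.0 option names):

-- one-shot batch read over the compacted table
select id, ts, temp, location from sensor;

-- for a continuous read, add to the table's WITH clause:
--   'read.streaming.enabled' = 'true',
--   'read.streaming.check-interval' = '4'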

The full set of write options is listed in the official Hudi documentation under Flink Write Config.

Writing COW

Same as MOR: just set the table type to COPY_ON_WRITE, as in the sketch below.
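
A minimal sketch of the sink DDL as a COW table; the sensor_cow name and path are illustrative, and the MOR-only compaction options are dropped:

CREATE TABLE sensor_cow(
  id VARCHAR(20) PRIMARY KEY NOT ENFORCED, -- record key
  ts VARCHAR(10),
  temp DOUBLE,
  location VARCHAR(20)
)
PARTITIONED BY (location)
WITH (
  'connector' = 'hudi',
  'path' = 'file:///Users/jiale.he/IdeaProjects/hudi-flink/src/main/resources/sensor_cow',
  'table.type' = 'COPY_ON_WRITE'
);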
