kafka-flinkSQL-iceberg Test Case
Flink's datagen connector is used to generate mock data and write it into Kafka, simulating a stream of incoming records; Flink then consumes the Kafka topic and writes the data into the Iceberg data lake.
The workflow is similar to the previous article, the kafka-flinkSQL-hudi test; see that post if you are interested.
1. Generate mock data with Flink:
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import java.util.Date;
/**
* @author hyl
* @date 2022/5/6
* @Description
* @ModifiedBy
*/
public class CreateDataAll {
    public static void main(String[] args) {
        // datagen source: randomly generated rows
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        String sourceDDL = "create table datagen_source (\n" +
                " id int,\n" +
                " data string,\n" +
                " price double,\n" +
                " ts as localtimestamp,\n" +
                " watermark for ts as ts\n" +
                ") with (\n" +
                " 'connector' = 'datagen',\n" +
                " 'rows-per-second'='" + args[0] + "',\n" + // rows generated per second, passed in as a program argument
                " 'fields.id.kind'='sequence',\n" +
                " 'fields.id.start'='" + args[1] + "',\n" + // first value of the id sequence
                " 'fields.id.end'='" + args[2] + "',\n" + // last value of the id sequence
                " 'fields.data.length'='50'\n" +
                ")";
        System.out.println(sourceDDL);
        tableEnv.executeSql(sourceDDL);
        // kafka sink
        String sinkDDL = "create table kafka_sink (\n" +
                " id int,\n" +
                " data string,\n" +
                " price double,\n" +
                " ts timestamp\n" +
                ") with (\n" +
                " 'connector' = 'kafka',\n" +
                " 'topic' = '" + args[3] + "',\n" +
                " 'properties.bootstrap.servers' = '192.168.100.14:9092',\n" +
                " 'properties.group.id' = 'flinkToKafka3',\n" +
                " 'scan.startup.mode' = 'earliest-offset',\n" +
                " 'format' = 'json',\n" +
                " 'json.fail-on-missing-field' = 'false',\n" +
                " 'json.ignore-parse-errors' = 'true'\n" +
                ")";
        System.out.println(sinkDDL);
        tableEnv.executeSql(sinkDDL);
        // insert into kafka: push the rows produced by the datagen source into Kafka to simulate a data stream
        String insertKafka = "insert into kafka_sink\n" +
                "select * from datagen_source ";
        System.out.println(insertKafka);
        tableEnv.executeSql(insertKafka);
        // print the submission time
        final Date startDate = new Date();
        System.out.println("Start time: " + startDate + "*********************************************");
    }
}
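Before wiring the topic into Iceberg, it can be useful to read a few records back out of Kafka to confirm the producer job is working. Below is a minimal sketch for that check (the class name PeekKafkaTopic is hypothetical and not part of the original jobs; it assumes the same broker address 192.168.100.14:9092 and takes the topic name as its first argument). It registers the topic as a source table and streams it to a print sink for local verification only.
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
/**
 * Hypothetical helper, not part of the original post: reads the generated
 * records back from Kafka and prints them to verify the producer job works.
 */
public class PeekKafkaTopic {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // same schema and connector options as kafka_sink above, used here as a source
        tableEnv.executeSql("create table kafka_check (\n" +
                " id int,\n" +
                " data string,\n" +
                " price double,\n" +
                " ts timestamp\n" +
                ") with (\n" +
                " 'connector' = 'kafka',\n" +
                " 'topic' = '" + args[0] + "',\n" +
                " 'properties.bootstrap.servers' = '192.168.100.14:9092',\n" +
                " 'properties.group.id' = 'kafkaPeek',\n" +
                " 'scan.startup.mode' = 'earliest-offset',\n" +
                " 'format' = 'json'\n" +
                ")");
        // the print connector writes every consumed row to stdout
        tableEnv.executeSql("create table print_sink (\n" +
                " id int,\n" +
                " data string,\n" +
                " price double,\n" +
                " ts timestamp\n" +
                ") with ('connector' = 'print')");
        // await() keeps the local job running so rows keep being printed
        tableEnv.executeSql("insert into print_sink select * from kafka_check").await();
    }
}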
2. Consume the Kafka data with Flink and write it to Iceberg in real time
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
/**
* @author hyl
* @date 2022/4/28
* @Description Pull the data from Kafka with Flink and write it into Iceberg in real time
*/
public class KafkaToIceberg {
    public static void main(String[] args) {
        System.setProperty("HADOOP_USER_NAME", "root");
        System.setProperty("user.name", "root");
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(4);
        // Iceberg only commits new data files when a checkpoint completes, so checkpointing must be enabled
        env.enableCheckpointing(6000);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(5000);
        env.getCheckpointConfig().setCheckpointTimeout(60000);
        env.getCheckpointConfig().setTolerableCheckpointFailureNumber(2);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(
                CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        env.getCheckpointConfig().enableUnalignedCheckpoints();
        env.getCheckpointConfig().setCheckpointStorage("hdfs://192.168.100.11:9000/tmp/hudi/testFlinkHudi/checkpoint/dir");
        final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // kafka source
        String kafkaSourceDDL = "create table kafka_source (\n" +
                " id int,\n" +
                " data string,\n" +
                " price double,\n" +
                " ts timestamp\n" +
                ") with (\n" +
                " 'connector' = 'kafka',\n" +
                " 'topic' = '" + args[0] + "',\n" + // topic to consume
                " 'properties.bootstrap.servers' = '192.168.100.14:9092',\n" +
                " 'properties.group.id' = 'flinkToKafka3',\n" +
                " 'scan.startup.mode' = 'earliest-offset',\n" +
                " 'format' = 'json',\n" +
                " 'json.fail-on-missing-field' = 'false',\n" +
                " 'json.ignore-parse-errors' = 'true'\n" +
                ")";
        tableEnv.executeSql(kafkaSourceDDL);
        System.out.println(kafkaSourceDDL);
        // create a hadoop catalog with the Table API
        TableResult tableResult = tableEnv.executeSql("CREATE CATALOG hadoop_catalog WITH (\n" +
                " 'type'='iceberg',\n" +
                " 'catalog-type'='hadoop',\n" +
                " 'warehouse'='hdfs://192.168.100.11:9000/tmp/iceberg/flink_iceberg/FlinkClientTable/',\n" +
                " 'property-version'='2', \n" +
                " 'format-version'='2'\n" +
                ")");
        // make sure the target database exists (a no-op if it was created earlier)
        tableEnv.executeSql("CREATE DATABASE IF NOT EXISTS hadoop_catalog.iceberg_hadoop_db");
        // create the target Iceberg table if it does not already exist
        String createIcebergTable = "create table if not exists hadoop_catalog.iceberg_hadoop_db." + args[1] + " (\n" +
                " id int,\n" +
                " data string,\n" +
                " price double,\n" +
                " ts timestamp(3),\n" +
                " part int,\n" +
                " event_time string,\n" +
                " event_date string, \n" +
                " PRIMARY KEY (`id`) NOT ENFORCED \n" +
                ") partitioned by ( part ) \n" +
                "WITH (\n" +
                " 'property-version'='2', \n" +
                " 'format-version'='2' \n" +
                ")";
        System.out.println(createIcebergTable);
        tableEnv.executeSql(createIcebergTable);
        // stream the Kafka records into the Iceberg table
        String insertDML = "insert into hadoop_catalog.iceberg_hadoop_db." + args[1] + " \n" +
                "select\n" +
                " id,\n" +
                " data,\n" +
                " price,\n" +
                " ts,\n" +
                " id/100000 as part,\n" + // integer division: one partition bucket per 100000 ids
                " date_format(ts, 'yyyy-MM-dd HH:mm:ss') as event_time,\n" +
                " date_format(ts, 'yyyy-MM-dd') as event_date\n" +
                "from kafka_source";
        tableEnv.executeSql(insertDML);
    }
}
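Once the job has completed a few checkpoints (Iceberg commits a new snapshot only when a checkpoint finishes, which is why checkpointing is configured above), the table can be read back with Flink SQL to verify the ingestion. A minimal batch-read sketch follows; the class name QueryIcebergTable and the per-partition count query are illustrative only, and it assumes the same warehouse path as the job above with the table name passed as the first argument.
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
/**
 * Hypothetical verification job, not part of the original post: batch-reads the
 * Iceberg table written by KafkaToIceberg and prints a row count per partition.
 */
public class QueryIcebergTable {
    public static void main(String[] args) {
        // same HDFS user as the writer job (may be needed for permissions)
        System.setProperty("HADOOP_USER_NAME", "root");
        // batch mode is enough for a one-off read of the current table snapshot
        final TableEnvironment tableEnv = TableEnvironment.create(
                EnvironmentSettings.newInstance().inBatchMode().build());
        tableEnv.executeSql("CREATE CATALOG hadoop_catalog WITH (\n" +
                " 'type'='iceberg',\n" +
                " 'catalog-type'='hadoop',\n" +
                " 'warehouse'='hdfs://192.168.100.11:9000/tmp/iceberg/flink_iceberg/FlinkClientTable/'\n" +
                ")");
        // count rows per partition bucket; print() collects the result to the client and prints it
        tableEnv.executeSql("select part, count(*) as cnt " +
                "from hadoop_catalog.iceberg_hadoop_db." + args[0] + " group by part").print();
    }
}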
POM file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>FlinkToIceberg</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- project compiler -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<!-- maven compiler-->
<scala.maven.plugin.version>3.2.2</scala.maven.plugin.version>
<maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
<maven.assembly.plugin.version>3.1.1</maven.assembly.plugin.version>
<!-- sdk -->
<java.version>1.8</java.version>
<!-- engine-->
<hadoop.version>2.9.2</hadoop.version>
<flink.version>1.13.6</flink.version>
<iceberg.version>0.13.1</iceberg.version>
<!-- <flink.version>1.12.2</flink.version>-->
<!-- <hoodie.version>0.9.0</hoodie.version>-->
<hive.version>2.3.9</hive.version>
<!-- <scope.type>provided</scope.type>-->
<scope.type>compile</scope.type>
</properties>
<dependencies>
<!-- flink dependencies; flink-runtime-web enables the local Flink web UI when running in the IDE -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_2.11</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<!-- flink-core -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<!-- interfaces needed to implement custom connectors -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<!-- Java API of flink-table -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_2.11</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<!-- the following two modules are required when running locally in the IDE -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_2.11</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- hadoop dependencies; without these the job cannot write to HDFS -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
<scope>${scope.type}</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>${scope.type}</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-flink-runtime-1.13 -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-flink-runtime-1.13</artifactId>
<version>${iceberg.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-api -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-api</artifactId>
<version>${iceberg.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.iceberg/iceberg-data -->
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-data</artifactId>
<version>${iceberg.version}</version>
</dependency>
<dependency>
<groupId>org.apache.iceberg</groupId>
<artifactId>iceberg-flink</artifactId>
<version>${iceberg.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- thin-jar packaging: only the project's own classes are packaged; dependencies are expected on the classpath under libs/ -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>libs/</classpathPrefix>
<mainClass>
KafkaToIceberg
</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>
Thanks for reading. If you are interested in the flink-hudi test case, see my earlier article.