Flink SQL: Consuming from Kafka and Writing to a Hive Table

Environment versions:

hadoop-3.1.0

hive-3.1.2

flink-1.13.2

1. Development

Maven dependencies:

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-java</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table-api-java-bridge_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-kafka_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-hive_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-scala_2.11</artifactId>
      <version>${flink.version}</version>
      <!--<scope>provided</scope>-->
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-clients_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>${hive.version}</version>
    </dependency>
    <!-- Used for writing Parquet files to HDFS -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-parquet_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-avro</artifactId>
      <version>${flink.version}</version>
    </dependency>
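
For reference, the placeholders above correspond to the versions listed at the start of this post. A minimal properties block might look like the following (a sketch; the property names are inferred from the placeholders used above):

    <properties>
      <flink.version>1.13.2</flink.version>
      <hive.version>3.1.2</hive.version>
      <scala.binary.version>2.11</scala.binary.version>
    </properties>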

Java code example:

package teld;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import java.time.Duration;

/**
 * @Author: lixz
 * @Date: 2022/10/13/9:38
 * @Description: ran into a Hive dependency conflict at first; see the Notes section on version alignment
 */
public class Kafka2Hive {
    public static void main( String[] args ) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().build();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env,settings);
        /**
         * Hive environment
         */
//        System.setProperty("HADOOP_USER_NAME","hdfs");
        String name            = "myhive";
        String defaultDatabase = "test";
        // The version here must exactly match the version of the hive-exec dependency, otherwise you get: NoSuchMethodException: org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(org.apache.hadoop.hive.conf.HiveConf)
        String hive_version = "3.1.2";
        String hiveConfDir     = "/opt/hive-3.1.2/conf";
        HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir,hive_version);
        tEnv.registerCatalog("myhive", hive);
        tEnv.useCatalog("myhive");
        tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
        // Kafka source
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("192.168.78.1:9092")
                .setTopics("test4")
                .setGroupId("my-group")
                .setStartingOffsets(OffsetsInitializer.latest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        // Attach the Kafka source as a DataStream
        DataStreamSource<String> stream = env.fromSource(source,
                WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(5)), "Kafka Source");
        DataStream<MyUser> dataStream = stream.map(new MapFunction<String, MyUser>() {
            @Override
            public MyUser map(String s) throws Exception {
                String[] arr = s.split(",");
                return new MyUser(arr[0], arr[1], Integer.valueOf(arr[2]));
            }
        }).returns(MyUser.class);
        // Register the stream as a temporary view
        tEnv.createTemporaryView("MyUser",dataStream);
        // Create the Hive table (it is created automatically if it does not exist; alternatively create it in Hive beforehand and skip this DDL, since the Hive catalog lets Flink SQL find the existing table at runtime)
        tEnv.executeSql("CREATE TABLE IF NOT EXISTS `myhive`.`test`.`useroplog` \n" +
                "(\n" +
                "`ID` STRING,\n" +
                "`NAME` STRING,\n" +
                "`AGE` INT\n" +
                ") \n" +
                "partitioned by(`DAY` STRING)\n" +
                "STORED AS parquet TBLPROPERTIES(\n" +
                // Automatic compaction of small files (new in Flink 1.12); solves the small-file problem caused by streaming writes to Hive
                "'auto-compaction'='true',\n" +
                // Maximum file size after compaction
                "'compaction.file-size'='128MB',\n"+
                "'format' = 'parquet',\n"+
                // Compression codec
                "'parquet.compression'='GZIP',\n"+
                // With hourly partitions this could be set to '1 h', i.e. a partition is committed one hour late so that its data is complete; with daily partitions a long delay makes little sense, since today's data would only become visible tomorrow
                "'sink.partition-commit.delay'='30 s',\n" +
                // 'metastore' is required for writes to Hive; 'success-file' should be listed as well
                // After a checkpoint flushes the data, a _SUCCESS file is created and the partition is registered in the Hive metastore, which makes the newly written data queryable from Hive
                "'sink.partition-commit.policy.kind'='metastore,success-file'\n" +
                ")");
        // Write into the Hive table
        tEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        tEnv.executeSql("insert into useroplog select *,'2022-10-13' as `DAY` from MyUser");
        // Print to stdout (for debugging)
//        tEnv.executeSql("select * from MyUser").print();
        // tEnv.executeSql(...) already submits the INSERT job, so a separate env.execute() is not needed here
//        env.execute();
    }
}
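
The MyUser POJO used above is not shown in the original code. A minimal sketch that matches the constructor call in the map function and the Hive columns could look like this (the field names are assumptions chosen to line up with the ID/NAME/AGE columns):

package teld;

/**
 * Minimal POJO sketch for the records parsed from the Kafka messages.
 * Public fields plus a public no-argument constructor make this a valid Flink POJO type.
 */
public class MyUser {
    public String ID;
    public String NAME;
    public Integer AGE;

    public MyUser() {
    }

    public MyUser(String id, String name, Integer age) {
        this.ID = id;
        this.NAME = name;
        this.AGE = age;
    }
}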

If the target Hive table has not been created, it is created automatically when the job runs. Let's look in Hive at the definition of the auto-created table:

CREATE TABLE `useroplog`(
  `id` string, 
  `name` string, 
  `age` int)
PARTITIONED BY ( 
  `day` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://dss0:8020/user/hive/warehouse/test.db/useroplog'
TBLPROPERTIES (
  'auto-compaction'='true', 
  'bucketing_version'='2', 
  'format'='parquet', 
  'parquet.compression'='GZIP', 
  'sink.partition-commit.delay'='1min', 
  'sink.partition-commit.policy.kind'='success-file', 
  'transient_lastDdlTime'='1665629249')

Package the code. Do not bundle the dependencies into the jar (to avoid duplicate classes); all required dependencies are provided on the cluster instead.

Submit the job:

flink run-application \
-t yarn-application \
-c teld.Kafka2Hive \
-Dyarn.provided.lib.dirs="hdfs://dss0:8020/user/flink/flink-dependency-1.13.2;hdfs://dss0:8020/user/flink/flink-dependency-1.13.2/lib;hdfs://dss0:8020/user/flink/flink-dependency-1.13.2/plugins" \
-Dyarn.application.name=flink2hivetest \
flink2hivetest-1.0-SNAPSHOT.jar

Screenshot of a successful submission:
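
The submission can also be confirmed from the command line with the YARN CLI, assuming the application name set via -Dyarn.application.name above:

yarn application -list | grep flink2hivetest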

Once we send data to Kafka, it is written into Hive. Let's look at the file structure the Hive table produces on HDFS:
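
As a quick test, records can be produced with the Kafka console producer and the warehouse directory from the DDL above can be inspected. A sketch (the sample records are made up, and older Kafka versions use --broker-list instead of --bootstrap-server):

# Send a few comma-separated records in the id,name,age format expected by the map function
kafka-console-producer.sh --bootstrap-server 192.168.78.1:9092 --topic test4
>1,zhangsan,20
>2,lisi,30

# List the files written under the table's warehouse directory
hdfs dfs -ls -R hdfs://dss0:8020/user/hive/warehouse/test.db/useroplog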

When writing in real time, partitions are created automatically. Let's query the data:
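
For example, from the Hive CLI or Beeline (a sketch; the partition value is the constant used in the INSERT statement above):

USE test;
SHOW PARTITIONS useroplog;
SELECT * FROM useroplog WHERE `day` = '2022-10-13' LIMIT 10;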

2. Notes

1. The HiveCatalog created in the code must specify the Hive version, and that version must exactly match the version of the hive-exec dependency.

2. The dependencies staged on the cluster's HDFS are as follows:
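
They can be listed with the HDFS CLI (the paths are the ones referenced by -Dyarn.provided.lib.dirs in the submit command above):

hdfs dfs -ls hdfs://dss0:8020/user/flink/flink-dependency-1.13.2
hdfs dfs -ls hdfs://dss0:8020/user/flink/flink-dependency-1.13.2/lib
hdfs dfs -ls hdfs://dss0:8020/user/flink/flink-dependency-1.13.2/plugins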

3. Hive must have the metastore service running:

bin/hive --service metastore >/dev/null 2>&1 &

Once started, check whether port 9083 is listening.
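
For example (assuming netstat is available; ss -tnlp works as well):

netstat -tnlp | grep 9083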

4. hive-site.xml configuration

The metastore URI must be specified:

  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://192.168.78.12:9083</value>
  </property>
