Flink SQL: writing Kafka data into Hive in real time


References

Time attribute definitions:
https://nightlies.apache.org/flink/flink-docs-release-1.13/zh/docs/dev/table/concepts/time_attributes/
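The article relies on a processing-time partition commit and never declares a Flink time attribute. As a point of reference for the linked page, here is a minimal sketch (not part of the original job) written in the same DDL style as the demo below: proc_time is a processing-time attribute, and event_time together with the watermark is an event-time attribute. Treating tid as an epoch-millisecond value is an assumption taken from a commented-out line in the demo; the table name, topic, and broker address are illustrative.

// Sketch only: shows both time-attribute styles from the referenced page.
val timeAttributeDdl: String =
  """
    |create table DemoSource(
    | vin String,
    | tid BIGINT,
    | datas String,
    | proc_time as PROCTIME(),
    | event_time as to_timestamp(from_unixtime(tid / 1000)),
    | watermark for event_time as event_time - interval '5' second
    |) WITH (
    | 'connector' = 'kafka',
    | 'topic' = 'topic_c',
    | 'properties.bootstrap.servers' = 'ip:9092',
    | 'properties.group.id' = 'flink-test-group',
    | 'format' = 'json',
    | 'scan.startup.mode' = 'earliest-offset'
    |)
    |""".stripMargin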

Environment

flink 1.13.1
java 1.8
scala 2.11
hive 2.1.1-cdh6.1.1
hadoop 3.0.0-cdh6.1.1

pom

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <flink.version>1.13.1</flink.version>
    <scala.version>2.11</scala.version>
<!--    <hive.version>2.1.1</hive.version>-->
    <hive.version>2.1.1-cdh6.1.1</hive.version>
<!--    <hadoop.version>3.0.0</hadoop.version>-->
    <hadoop.version>3.0.0-cdh6.1.1</hadoop.version>
    <log4j.version>2.8.2</log4j.version>
  </properties>
  <!--
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-sql-connector-hive-2.2.0_${scala.version}</artifactId>
    <version>${flink.version}</version>
    <scope>provided</scope>
  </dependency>
  -->
      <repositories>
          <repository>
              <id>cloudera</id>
              <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
          </repository>
          <!--    <repository>-->
          <!--      <id>spring-plugin</id>-->
          <!--      <url>https://repo.spring.io/plugins-release/</url>-->
          <!--    </repository>-->
      </repositories>



  <dependencies>
    <dependency>
        <groupId>commons-cli</groupId>
        <artifactId>commons-cli</artifactId>
        <version>1.4</version>
    </dependency>
    <!-- Apache Flink dependencies -->
    <!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
<!--    <dependency>-->
<!--      <groupId>org.apache.flink</groupId>-->
<!--      <artifactId>flink-scala_2.11</artifactId>-->
<!--      <version>${flink.version}</version>-->
<!--      <scope>provided</scope>-->
<!--    </dependency>-->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-scala_${scala.version}</artifactId>
      <version>${flink.version}</version>
<!--      <scope>provided</scope>-->
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
<!--      <artifactId>flink-table-api-java-bridge_2.11</artifactId>-->
      <artifactId>flink-table-api-scala-bridge_2.11</artifactId>
      <version>${flink.version}</version>
<!--      <scope>provided</scope>-->
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table-api-java-bridge_2.11</artifactId>
<!--      <artifactId>flink-table-api-scala-bridge_2.11</artifactId>-->
      <version>${flink.version}</version>
<!--      <scope>provided</scope>-->
    </dependency>


    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-clients_${scala.version}</artifactId>
      <version>${flink.version}</version>
<!--      <scope>provided</scope>-->
    </dependency>

    <!-- Flink SQL dependencies -->



    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table-planner-blink_2.11</artifactId>
      <version>${flink.version}</version>
      <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table-runtime-blink -->
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table-runtime-blink_${scala.version}</artifactId>
      <version>${flink.version}</version>
      <scope>provided</scope>
    </dependency>




    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table-common</artifactId>
      <version>${flink.version}</version>
<!--      <scope>provided</scope>-->
    </dependency>
    <!-- Add connector dependencies here. They must be in the default scope (compile). -->


    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-sql-connector-kafka_${scala.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-json</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-hive_2.11</artifactId>
      <version>${flink.version}</version>
      <!--             <scope>provided</scope>-->
    </dependency>

    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>${hive.version}</version>
<!--      <version>2.1.1-cdh6.1.1</version>-->
      <exclusions>
        <exclusion>
          <artifactId>calcite-avatica</artifactId>
          <groupId>org.apache.calcite</groupId>
        </exclusion>
        <exclusion>
          <artifactId>calcite-core</artifactId>
          <groupId>org.apache.calcite</groupId>
        </exclusion>
        <exclusion>
          <artifactId>calcite-linq4j</artifactId>
          <groupId>org.apache.calcite</groupId>
        </exclusion>
      </exclusions>
      <!--             <scope>provided</scope>-->
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
    </dependency>
    <!--<dependency>-->
    <!--<groupId>org.apache.flink</groupId>-->
    <!--<artifactId>flink-shaded-hadoop-3</artifactId>-->
    <!--<version>3.1.1.7.1.1.0-565-9.0</version>-->
    <!--<scope>provided</scope> -->
    <!--</dependency>-->


    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-parquet_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-orc_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-kafka_${scala.version}</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-base</artifactId>
      <version>${flink.version}</version>
    </dependency>


    <!--    <dependency>-->
<!--      <groupId>org.apache.kafka</groupId>-->
<!--      <artifactId>kafka-clients</artifactId>-->
<!--      <version>1.0.1</version>-->
<!--    </dependency>-->
<!--    <dependency>-->
<!--      <groupId>org.projectlombok</groupId>-->
<!--      <artifactId>lombok</artifactId>-->
<!--      <version>1.16.18</version>-->
<!--    </dependency>-->


<!--     Add logging framework, to produce console output when running in the IDE.-->
<!--     These dependencies are excluded from the application JAR by default.-->
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-slf4j-impl</artifactId>
      <version>${log4j.version}</version>
      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-api</artifactId>
      <version>${log4j.version}</version>
      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-core</artifactId>
      <version>${log4j.version}</version>
      <scope>runtime</scope>
    </dependency>

<!--    <dependency>-->
<!--      <groupId>com.alibaba</groupId>-->
<!--      <artifactId>fastjson</artifactId>-->
<!--      <version>1.2.72</version>-->
<!--    </dependency>-->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.46</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-jdbc_2.11</artifactId>
      <version>${flink.version}</version>
    </dependency>
    <!--<dependency>-->
    <!--<groupId>org.apache.flink</groupId>-->
    <!--<artifactId>flink-shaded-hadoop-2-uber</artifactId>-->
    <!--<version>${flink-shaded-hadoop.version}</version>-->
    <!--</dependency>-->
  </dependencies>

Flink lib directory


antlr-runtime-3.5.2.jar
com.ibm.icu-4.4.2.jar
commons-cli-1.4.jar
flink-connector-hive_2.11-1.13.1.jar
flink-connector-kafka_2.11-1.13.1.jar
flink-core-1.13.0.jar
flink-csv-1.13.1.jar
flink-dist_2.11-1.13.1.jar
flink-json-1.13.1.jar
flink-shaded-hadoop-2-uber-2.7.5-7.0.jar
flink-shaded-hadoop-2-uber-3.0.0-cdh6.3.0-7.0.jar
flink-shaded-zookeeper-3.4.14.jar
flink-sql-connector-hive-2.2.0_2.11-1.13.3.jar
flink-table-api-java-1.13.1.jar
flink-table-api-java-bridge_2.11-1.13.1.jar.0
flink-table-api-scala_2.11-1.13.1.jar.0
flink-table-planner_2.11-1.13.1.jar
flink-table-planner-blink_2.11-1.13.1.jar
flink-table-runtime-blink_2.11-1.13.1.jar
jindo-flink-sink-3.6.1.jar
jindofs-sdk-3.6.1.jar
log4j-1.2-api-2.12.1.jar
log4j-api-2.12.1.jar
log4j-core-2.12.1.jar
log4j-slf4j-impl-2.12.1.jar

Demo

package com.gwm.core.kafka

import org.apache.flink.api.scala._
import org.apache.flink.table.api._
import org.apache.flink.table.api.bridge.scala._
import com.gwm.utils.log.Logging
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog

/**
 * @Description: read JSON events from Kafka and stream them into a partitioned Hive table with Flink SQL.
 */
object TestFlinkKafka extends Logging{
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    //checkpoint
    env.enableCheckpointing(1000 * 60,CheckpointingMode.EXACTLY_ONCE)
    env.getCheckpointConfig.setCheckpointStorage("hdfs://test/checkpoint/")

//    val env = StreamExecutionEnvironment.getExecutionEnvironment.enableCheckpointing(5 * 60 * 1000)
//    val checkpointConfig = env.getCheckpointConfig
//    checkpointConfig.setMinPauseBetweenCheckpoints(2 * 60 * 1000)
//    checkpointConfig.setCheckpointTimeout(3 * 60 * 1000)
//    checkpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    val settings = EnvironmentSettings.newInstance().useBlinkPlanner().build()
    //stream table env
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // Hive catalog
    val hiveCatalogName: String = "kafka_hc"
    val defaultDatabasesName: String = "test"
    val hiveConfDir: String = "/alidata1/tmp/hivecf/"

    val cataLog: HiveCatalog = new HiveCatalog(hiveCatalogName, defaultDatabasesName, hiveConfDir)
    tableEnv.registerCatalog("kafka_hc", cataLog)

    tableEnv.useCatalog(hiveCatalogName)

    // create the Kafka source table (default SQL dialect)
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    tableEnv.executeSql("drop table if exists KafkaSourceTable")

    //ts as to_timestamp(from_unixtime(tid/1000, 'yy-MM-dd HH:mm:ss'))

    // Note: the original DDL set 'scan.startup.mode' twice ('earliest-offset' and
    // 'group-offsets'); only one value is kept here.
    val createTableSql: String =
      """
        |create table KafkaSourceTable(
        | vin String,
        | tid String,
        | source String,
        | datas String,
        | ts as substr(concat('20', tid), 1, 8)
        | ) WITH (
        | 'connector' = 'kafka',
        | 'topic' = 'topic_c',
        | 'properties.bootstrap.servers' = 'ip:9092,ip2:9092,ip3:9092',
        | 'properties.group.id' = 'flink-test-group',
        | 'format' = 'json',
        | 'json.ignore-parse-errors' = 'true',
        | 'scan.startup.mode' = 'group-offsets'
        |)
        """.stripMargin

    /**
     * 'key.format' = 'json',
     * 'key.json.ignore-parse-errors' = 'true',
     * 'key.fields' = 'vin;tid;source;source',
     * 'value.format' = 'json',
     * 'value.json.fail-on-missing-field' = 'false',
     * 'value.fields-include' = 'ALL'
     */
    log.info(s"createTableSql: $createTableSql")
    tableEnv.executeSql(createTableSql)




    // switch to the Hive catalog and Hive dialect
    tableEnv.useCatalog("kafka_hc")
    tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE)
    tableEnv.useDatabase("test")

//    val dropSql = ""

    val hiveSql: String =
      """
        |create table if not exists HiveSinkTable(
        | vin String,
        | tid String,
        | source String,
        | datas String )
        |  partitioned by (`date` String)
        | row format delimited fields terminated by '\t'
        |stored as orc
        |tblproperties (
        | 'orc.compress'='SNAPPY',
        | 'partition.time-extractor.timestamp-pattern' = '$date 00:00:00',
        | 'sink.partition-commit.trigger' = 'process-time',
        | 'sink.partition-commit.policy.kind' = 'metastore,success-file'
        |)
        |""".stripMargin

    tableEnv.executeSql(hiveSql)


    // switch back to the default SQL dialect for the INSERT
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)

    val insertSql: String =
      """
        |insert into HiveSinkTable
        |select vin, tid, source, datas, from_unixtime(unix_timestamp(ts,'yyyyMMdd'),'yyyy-MM-dd') as `date`
        |from KafkaSourceTable
        |""".stripMargin
    //select vin, tid, sources, datas, date_format(ts, 'yyyy-MM-dd') as `date`
    log.info(s"insertSql: $insertSql")

    tableEnv.executeSql(insertSql)




//    env.execute("kafka to hive")  // not needed here: executeSql() already submits the job

  }
}
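The demo commits partitions with 'sink.partition-commit.trigger' = 'process-time'. If the Kafka source declares an event-time attribute and watermark (see the sketch under References), the Hive sink can instead commit by the partition's own time. The DDL below is only a hedged variant, not the table used by the original job: it must be executed while the Hive dialect is active, exactly like HiveSinkTable above, and the 'sink.partition-commit.delay' value of one hour is an assumption.

// Sketch only: an event-time-driven variant of the Hive sink table.
val hiveSinkByPartitionTime: String =
  """
    |create table if not exists HiveSinkTableByEventTime(
    | vin String,
    | tid String,
    | source String,
    | datas String )
    | partitioned by (`date` String)
    |stored as orc
    |tblproperties (
    | 'orc.compress' = 'SNAPPY',
    | 'partition.time-extractor.timestamp-pattern' = '$date 00:00:00',
    | 'sink.partition-commit.trigger' = 'partition-time',
    | 'sink.partition-commit.delay' = '1 h',
    | 'sink.partition-commit.policy.kind' = 'metastore,success-file'
    |)
    |""".stripMargin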


Hive

Because the tid upload times are not standardized, the derived partition values are not always accurate.
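One possible workaround, sketched here and not taken from the article: derive the partition from the time the record is processed instead of from tid. This keeps partition values well-formed at the cost of partitioning by arrival date rather than event date. Like the original insert, it runs under the default SQL dialect.

// Sketch only: partition by processing date instead of the tid-derived date.
val insertByArrivalDate: String =
  """
    |insert into HiveSinkTable
    |select vin, tid, source, datas,
    |       date_format(localtimestamp, 'yyyy-MM-dd') as `date`
    |from KafkaSourceTable
    |""".stripMargin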
