Reference docs
Environment
flink 1.13.1
java 1.8
scala 2.11
hive 2.1.1-cdh6.1.1
hadoop 3.0.0-cdh6.1.1
pom
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <flink.version>1.13.1</flink.version>
    <scala.version>2.11</scala.version>
    <hive.version>2.1.1-cdh6.1.1</hive.version>
    <hadoop.version>3.0.0-cdh6.1.1</hadoop.version>
    <log4j.version>2.8.2</log4j.version>
</properties>
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>
<dependencies>
    <dependency>
        <groupId>commons-cli</groupId>
        <artifactId>commons-cli</artifactId>
        <version>1.4</version>
    </dependency>
    <!-- Apache Flink dependencies -->
    <!-- In the Flink quickstart these are 'provided' so they are not packaged into the job JAR;
         here the provided scopes are commented out so the job can also be run from the IDE. -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_${scala.version}</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-scala-bridge_${scala.version}</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_${scala.version}</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.version}</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <!-- Flink SQL dependencies -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_${scala.version}</artifactId>
        <version>${flink.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-runtime-blink_${scala.version}</artifactId>
        <version>${flink.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <!-- Add connector dependencies here. They must be in the default scope (compile). -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-sql-connector-kafka_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-hive_${scala.version}</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>${hive.version}</version>
        <exclusions>
            <exclusion>
                <artifactId>calcite-avatica</artifactId>
                <groupId>org.apache.calcite</groupId>
            </exclusion>
            <exclusion>
                <artifactId>calcite-core</artifactId>
                <groupId>org.apache.calcite</groupId>
            </exclusion>
            <exclusion>
                <artifactId>calcite-linq4j</artifactId>
                <groupId>org.apache.calcite</groupId>
            </exclusion>
        </exclusions>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-parquet_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-orc_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-base</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- Logging framework, to produce console output when running in the IDE. -->
    <!-- These dependencies are excluded from the application JAR by default. -->
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-slf4j-impl</artifactId>
        <version>${log4j.version}</version>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-api</artifactId>
        <version>${log4j.version}</version>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>${log4j.version}</version>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.46</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-jdbc_${scala.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
flink lib files (the two entries ending in .jar.0 are presumably renamed to keep them off the classpath)
antlr-runtime-3.5.2.jar
com.ibm.icu-4.4.2.jar
commons-cli-1.4.jar
flink-connector-hive_2.11-1.13.1.jar
flink-connector-kafka_2.11-1.13.1.jar
flink-core-1.13.0.jar
flink-csv-1.13.1.jar
flink-dist_2.11-1.13.1.jar
flink-json-1.13.1.jar
flink-shaded-hadoop-2-uber-2.7.5-7.0.jar
flink-shaded-hadoop-2-uber-3.0.0-cdh6.3.0-7.0.jar
flink-shaded-zookeeper-3.4.14.jar
flink-sql-connector-hive-2.2.0_2.11-1.13.3.jar
flink-table-api-java-1.13.1.jar
flink-table-api-java-bridge_2.11-1.13.1.jar.0
flink-table-api-scala_2.11-1.13.1.jar.0
flink-table-planner_2.11-1.13.1.jar
flink-table-planner-blink_2.11-1.13.1.jar
flink-table-runtime-blink_2.11-1.13.1.jar
jindo-flink-sink-3.6.1.jar
jindofs-sdk-3.6.1.jar
log4j-1.2-api-2.12.1.jar
log4j-api-2.12.1.jar
log4j-core-2.12.1.jar
log4j-slf4j-impl-2.12.1.jar
Demo
package com.gwm.core.kafka

import com.gwm.utils.log.Logging
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect}
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog
/**
 * Reads JSON records from a Kafka topic and writes them into a partitioned Hive ORC table
 * via a HiveCatalog.
 */
object TestFlinkKafka extends Logging {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // checkpoint every 60s, exactly-once
    env.enableCheckpointing(1000 * 60, CheckpointingMode.EXACTLY_ONCE)
    env.getCheckpointConfig.setCheckpointStorage("hdfs://test/checkpoint/")
    // val env = StreamExecutionEnvironment.getExecutionEnvironment.enableCheckpointing(5 * 60 * 1000)
    // val checkpointConfig = env.getCheckpointConfig
    // checkpointConfig.setMinPauseBetweenCheckpoints(2 * 60 * 1000)
    // checkpointConfig.setCheckpointTimeout(3 * 60 * 1000)
    // checkpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    val settings = EnvironmentSettings.newInstance().useBlinkPlanner().build()
    // stream table env
    val tableEnv = StreamTableEnvironment.create(env, settings)
    // Hive catalog
    val hiveCatalogName: String = "kafka_hc"
    val defaultDatabasesName: String = "test"
    val hiveConfDir: String = "/alidata1/tmp/hivecf/"
    val cataLog: HiveCatalog = new HiveCatalog(hiveCatalogName, defaultDatabasesName, hiveConfDir)
    tableEnv.registerCatalog(hiveCatalogName, cataLog)
    tableEnv.useCatalog(hiveCatalogName)

    // create the Kafka source table (default dialect)
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    tableEnv.executeSql("drop table if exists KafkaSourceTable")
    // ts as to_timestamp(from_unixtime(tid/1000, 'yy-MM-dd HH:mm:ss'))
    // note: each option may appear only once in the WITH clause; use
    // 'earliest-offset' instead of 'group-offsets' to re-read the topic from the beginning
    val createTableSql: String =
      """
        |create table KafkaSourceTable(
        |  vin String,
        |  tid String,
        |  source String,
        |  datas String,
        |  ts as substr(concat('20', tid), 1, 8)
        |) WITH (
        |  'connector' = 'kafka',
        |  'topic' = 'topic_c',
        |  'properties.bootstrap.servers' = 'ip:9092,ip2:9092,ip3:9092',
        |  'properties.group.id' = 'flink-test-group',
        |  'format' = 'json',
        |  'json.ignore-parse-errors' = 'true',
        |  'scan.startup.mode' = 'group-offsets'
        |)
      """.stripMargin
    /**
     * Optional key/value format settings that could be used in the WITH clause instead:
     * 'key.format' = 'json',
     * 'key.json.ignore-parse-errors' = 'true',
     * 'key.fields' = 'vin;tid;source',
     * 'value.format' = 'json',
     * 'value.json.fail-on-missing-field' = 'false',
     * 'value.fields-include' = 'ALL'
     */
    log.info(s"createTableSql: $createTableSql")
    tableEnv.executeSql(createTableSql)
    // switch to the Hive catalog/dialect to create the sink table
    tableEnv.useCatalog(hiveCatalogName)
    tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE)
    tableEnv.useDatabase(defaultDatabasesName)
    val hiveSql: String =
      """
        |create table if not exists HiveSinkTable(
        |  vin String,
        |  tid String,
        |  source String,
        |  datas String)
        |partitioned by (`date` String)
        |row format delimited fields terminated by '\t'
        |stored as orc
        |tblproperties (
        |  'orc.compress' = 'SNAPPY',
        |  'partition.time-extractor.timestamp-pattern' = '$date 00:00:00',
        |  'sink.partition-commit.trigger' = 'process-time',
        |  'sink.partition-commit.policy.kind' = 'metastore,success-file'
        |)
        |""".stripMargin
    tableEnv.executeSql(hiveSql)
    // switch back to the default dialect for the insert
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    val insertSql: String =
      """
        |insert into HiveSinkTable
        |select vin, tid, source, datas, from_unixtime(unix_timestamp(ts, 'yyyyMMdd'), 'yyyy-MM-dd') as `date`
        |from KafkaSourceTable
        |""".stripMargin
    log.info(s"insertSql: $insertSql")
    tableEnv.executeSql(insertSql)
    // executeSql already submits the INSERT job, so env.execute is not needed here
    // env.execute("kafka to hive")
  }
}
Hive
tid is an upload timestamp supplied by the producer and is not standardized, so the partitions derived from it are inaccurate.
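If an approximate partition time is acceptable, one workaround is to partition by the Kafka record timestamp instead of tid; the Kafka SQL connector exposes it as a metadata column. A minimal sketch, assuming producer/broker timestamps are close enough to event time for daily partitions (the column name kafka_ts is illustrative):

// Sketch: derive the partition date from the Kafka record timestamp instead of tid.
// Assumption: producer/broker timestamps are accurate enough for daily partitions.
val createTableSql: String =
  """
    |create table KafkaSourceTable(
    |  vin String,
    |  tid String,
    |  source String,
    |  datas String,
    |  kafka_ts TIMESTAMP_LTZ(3) METADATA FROM 'timestamp'
    |) WITH (
    |  'connector' = 'kafka',
    |  'topic' = 'topic_c',
    |  'properties.bootstrap.servers' = 'ip:9092,ip2:9092,ip3:9092',
    |  'properties.group.id' = 'flink-test-group',
    |  'format' = 'json',
    |  'json.ignore-parse-errors' = 'true',
    |  'scan.startup.mode' = 'group-offsets'
    |)
  """.stripMargin

val insertSql: String =
  """
    |insert into HiveSinkTable
    |select vin, tid, source, datas, date_format(kafka_ts, 'yyyy-MM-dd') as `date`
    |from KafkaSourceTable
  """.stripMargin

With a watermark declared on kafka_ts, the sink could also use 'sink.partition-commit.trigger' = 'partition-time' so partitions commit when the watermark passes the partition end.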