Flink can write data to Hive by connecting to a Hive catalog.
Important!
The Hive sink table must have the following table properties:
'transactional' = 'false'
'sink.partition-commit.policy.kind'='metastore,success-file'
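These two keys go into the TBLPROPERTIES clause of the Hive sink table's DDL. A minimal illustration (table name and columns are placeholders; the full DDL used in this example is in the configuration file further below):
CREATE TABLE my_db.my_hive_sink (
  `id` INT,
  `payload` STRING )
PARTITIONED BY (dt STRING) STORED AS orc TBLPROPERTIES (
  'transactional' = 'false',
  'sink.partition-commit.policy.kind' = 'metastore,success-file' )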
Below is a complete example.
package com.kkb.flink.stream;
import com.kkb.flink.stream.util.FlinkUtils;
import com.kkb.flink.stream.util.PropertiesReader;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author china_tao
* @since 2020-08-03 09:51
*/
public class KafkaToHiveCatlog1 {
final static Logger logger = LoggerFactory.getLogger(KafkaToHiveCatlog1.class);
public static void main(String[] args) throws Exception {
// if(args.length <1 ) {
// System.err.println("flink框架需要配置文件,请输入配置文件路径");
// return;
// }
//{"num":100,"statusKeyValueMap":{"f1":"stop","f2":20,"f3":444.444},"ts":1615022498959,"vin":"20200712082200002"}
String filePath = "catlog1.properties";
// Initialize the configuration reader
PropertiesReader propertiesReader = new PropertiesReader(filePath);
String timecharacteristic = propertiesReader.getProperties("timecharacteristic");
boolean isDebug = Boolean.parseBoolean(propertiesReader.getProperties("isDebug"));
String kafkaTableName = propertiesReader.getProperties("kafkaTableName");
String createKafkaSourceSQL = propertiesReader.getProperties("createKafkaSourceSQL");
String kafkaTopic = propertiesReader.getProperties("kafka.topic");
String kafkaBootstrapServers = propertiesReader.getProperties("kafka.bootstrap.servers");
String kafkaGroupId = propertiesReader.getProperties("kafka.group.id");
String kafkaFormat = propertiesReader.getProperties("kafka.format");
String scanStartUpMode = propertiesReader.getProperties("scan.startup.mode");
String hiveTableName = propertiesReader.getProperties("hiveTableName");
String createHiveSinkSQL = propertiesReader.getProperties("createHiveSinkSQL");
String alterHiveTableSQL = propertiesReader.getProperties("alterHiveTableSQL");
String insertSQL = propertiesReader.getProperties("insertSQL");
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Set the job's time characteristic: EventTime, ProcessingTime or IngestionTime
FlinkUtils.handelTimeCharacteristic(env,timecharacteristic);
env.enableCheckpointing(60 * 1000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointTimeout(90 * 1000);
// env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
// env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
// env.getCheckpointConfig().setPreferCheckpointForRecovery(true);
// env.getCheckpointConfig().setTolerableCheckpointFailureNumber(5);
// RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration = new RestartStrategies.FailureRateRestartStrategyConfiguration(3, Time.minutes(5), Time.seconds(10));
// env.setRestartStrategy(restartStrategyConfiguration);
final EnvironmentSettings setting = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, setting);
// Register the Hive catalog
String catalogName = "my_catalog";
String defaultDatabase = "default";
HiveCatalog catalog = new HiveCatalog(
catalogName, // catalog name
defaultDatabase, // default database
"/etc/hive/conf", // Hive config (hive-site.xml) directory
"2.1.0" // Hive version
);
tableEnv.registerCatalog(catalogName, catalog);
tableEnv.useCatalog(catalogName);
//test.kafkaSourceTable
String kafkaDBname = kafkaTableName.split("\\.")[0];
String kafkaTbname = kafkaTableName.split("\\.")[1];
tableEnv.executeSql("CREATE DATABASE IF NOT EXISTS "+kafkaDBname);
// 1. Check whether the Hive sink table already exists in the catalog
String hiveDBname = hiveTableName.split("\\.")[0];
String hiveTbname = hiveTableName.split("\\.")[1];
ObjectPath hiveTbop = new ObjectPath(hiveDBname,hiveTbname);
if(isDebug){
System.out.println("删除hive表");
tableEnv.getCatalog(catalogName).get().dropTable(hiveTbop,true);
}
logger.info(hiveTbname + " exists: " + tableEnv.getCatalog(catalogName).get().tableExists(hiveTbop));
boolean isHiveSinkTable =tableEnv.getCatalog(catalogName).get().tableExists(hiveTbop);
System.out.println("hive 表是否存在:"+isHiveSinkTable);
//2. Drop the Kafka table from the catalog and recreate it on every run; if the schema changes, just edit the CREATE statement in the config file, no ALTER is needed
//dropTable(objectPath, ignoreIfNotExists): if ignoreIfNotExists is false, a missing table throws an exception;
//if it is true, a missing table is silently ignored
ObjectPath kafkaTableop = new ObjectPath(kafkaDBname,kafkaTbname);
tableEnv.getCatalog(catalogName).get().dropTable(kafkaTableop,true);
//3. Recreate the Kafka source table in the catalog
final String createKafkaSourceTableSQL = FlinkUtils.getCreateKafkaSourceTableSQL(kafkaTopic,kafkaBootstrapServers,kafkaGroupId,kafkaFormat,scanStartUpMode,createKafkaSourceSQL);
tableEnv.executeSql(createKafkaSourceTableSQL);
System.out.println("Created Kafka table " + kafkaTableName);
//4. Based on the check in step 1, create the Hive sink table if it does not exist
if(!isHiveSinkTable){
System.out.println("创建hive表");
tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tableEnv.executeSql(createHiveSinkSQL);
}
// List<TableColumn> cols = tableEnv.getCatalog(catalogName).get().getTable(hiveTbop).getSchema().getTableColumns();
// for (int j = 0; j <cols.size() ; j++) {
// System.out.println(hiveTableName+" col--->"+cols.get(j).getName());
// }
//
// List<TableColumn> kafkacols = tableEnv.getCatalog(catalogName).get().getTable(kafkaTableop).getSchema().getTableColumns();
// for (int j = 0; j <kafkacols.size() ; j++) {
// System.out.println(kafkaTableName+" col--->"+kafkacols.get(j).getName());
// }
//5. If the config file contains a Hive ALTER statement, execute it; an ALTER must run after the table has been created
if(alterHiveTableSQL!=null&& StringUtils.isNotBlank(alterHiveTableSQL)){
System.out.println("执行alert");
tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE); //切换成hive的操作形式,这个时候就要使用hive的语法,如果是有flink的语法,会报错
tableEnv.executeSql(alterHiveTableSQL);
}
//6. Run the query and write to Hive: select from the Kafka source table and insert the result into the Hive table
System.out.println("Starting insert");
tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT); // switch back to the default (Flink SQL) dialect
tableEnv.executeSql(insertSQL);
}
}
/**
* This is the getCreateKafkaSourceTableSQL helper called above via FlinkUtils:
* it appends the Kafka connector options (WITH clause) to the CREATE TABLE
* statement taken from the configuration file.
* @param kafkaTopic Kafka topic
* @param kafkaBootstrapServers Kafka bootstrap servers
* @param kafkaGroupId Kafka consumer group ID
* @param kafkaFormat message format (e.g. json)
* @param scanStartUpMode consumer start position
* @param createKafkaSourceSQL CREATE TABLE statement from the config file (without the WITH clause)
* @return the full CREATE TABLE statement for the Kafka source table
*/
public static String getCreateKafkaSourceTableSQL(String kafkaTopic,String kafkaBootstrapServers,String kafkaGroupId,String kafkaFormat,String scanStartUpMode,String createKafkaSourceSQL) {
String resultCreateTableSql = createKafkaSourceSQL + " WITH ( "
+" 'connector' = 'kafka' ,"
+ " 'topic' = '" + kafkaTopic + "',"
+ " 'properties.bootstrap.servers' = '" + kafkaBootstrapServers + "',"
+ " 'properties.group.id' = '" + kafkaGroupId + "',"
+ " 'format' = '" + kafkaFormat + "',"
+ " 'scan.startup.mode' = '" + scanStartUpMode + "',"
+ " 'json.fail-on-missing-field' = 'false',"
+ " 'json.ignore-parse-errors' = 'true' )";
return resultCreateTableSql;
}
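The example also relies on two small project utilities that are not shown: PropertiesReader (loads the .properties file and returns values by key) and FlinkUtils.handelTimeCharacteristic (applies the configured time characteristic to the environment). A minimal sketch of what they could look like, assuming a classpath-based properties file and the legacy setStreamTimeCharacteristic API of that Flink generation; the real implementations in the project may differ:
// PropertiesReader.java (sketch)
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
public class PropertiesReader {
    private final Properties props = new Properties();
    public PropertiesReader(String filePath) throws IOException {
        // Load e.g. catlog1.properties from the classpath (src/main/resources)
        try (InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(filePath)) {
            props.load(in);
        }
    }
    public String getProperties(String key) {
        return props.getProperty(key);
    }
}
// FlinkUtils.java (sketch, only the time-characteristic helper; getCreateKafkaSourceTableSQL is shown above)
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class FlinkUtils {
    // Maps the configured string ("eventTime" / "ingestionTime" / "processingTime") to Flink's TimeCharacteristic
    public static void handelTimeCharacteristic(StreamExecutionEnvironment env, String tc) {
        if ("eventTime".equals(tc)) {
            env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        } else if ("ingestionTime".equals(tc)) {
            env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
        } else {
            env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
        }
    }
}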
Below is the configuration file (catlog1.properties):
# eventTime / ingestionTime / processingTime. Note: use exactly one of these three values, case sensitive
timecharacteristic=eventTime
# true during development, false in production. If true, tables created on a previous run are dropped on every restart, which makes debugging easier; if false, existing tables are kept
isDebug=false
#databaseName=
#################################### Kafka settings ####################################
# Name of the Kafka source table, in db.table format; it cannot be created in the default database
kafkaTableName=test.kafkaSourceTable
# Schema of the Kafka table. Nested JSON objects are declared as ROW; fields are later accessed as statusKeyValueMap.f1 etc.
# The last three columns: rowtime derives the event time from ts, pts is a processing-time column, and the watermark tolerates 1 second of out-of-order data (this interval is configurable)
createKafkaSourceSQL=CREATE TABLE test.kafkaSourceTable( \
`num` INT, \
`statusKeyValueMap` ROW(`f1` STRING,`f2` INT,`f3` STRING), \
`rowtime` as TO_TIMESTAMP(FROM_UNIXTIME(`ts`/1000,'yyyy-MM-dd HH:mm:ss')), \
`vin` STRING,\
`ts` BIGINT, \
`pts` AS PROCTIME(), \
WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND)
# Kafka topic
kafka.topic=veche
# Kafka bootstrap servers
kafka.bootstrap.servers=hadoop1:9092,hadoop2:9092,hadoop3:9092
# Kafka consumer group ID
kafka.group.id=kkb
# Kafka message format
kafka.format=json
# Start position for consuming Kafka: latest offset
scan.startup.mode=latest-offset
#################################### Hive settings ####################################
hiveTableName=test_datax.ods_veh_vehicle_status_his_rt
createHiveSinkSQL= CREATE TABLE test_datax.ods_veh_vehicle_status_his_rt ( \
`num` INT,\
`f1` STRING, \
`f2` INT, \
`f3` STRING, \
`vin` STRING,\
`ts` BIGINT ) \
PARTITIONED BY (dt STRING, hr STRING) STORED AS orc TBLPROPERTIES ( \
'sink.partition-commit.trigger'='partition-time' , \
'transactional' = 'false' , \
'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00' , \
'sink.partition-commit.policy.kind'='metastore,success-file')
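# Example of how the settings above interact: a record with rowtime 2020-08-03 09:15:00 is written into
# partition dt=2020-08-03/hr=09; the 'partition-time' trigger commits that partition once the watermark passes
# 2020-08-03 09:00:00 (the time extracted by the pattern) plus sink.partition-commit.delay (not set here, default 0 s),
# and committing both registers the partition in the Hive metastore and writes a _SUCCESS file into the partition directory.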
alterHiveTableSQL=
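# Left empty here. If the table schema needs to evolve later, a Hive ALTER statement can be configured instead,
# for example (illustrative only; the column name is made up):
# alterHiveTableSQL=ALTER TABLE test_datax.ods_veh_vehicle_status_his_rt ADD COLUMNS (`speed` DOUBLE)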
#################################### Insert statement ####################################
insertSQL=INSERT INTO test_datax.ods_veh_vehicle_status_his_rt SELECT num,statusKeyValueMap.f1 as f1,statusKeyValueMap.f2 as f2 ,\
statusKeyValueMap.f3 as f3 , vin ,ts, \
DATE_FORMAT(rowtime, 'yyyy-MM-dd'), DATE_FORMAT(rowtime, 'HH') FROM test.kafkaSourceTable
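For reference, with the Kafka settings above, getCreateKafkaSourceTableSQL assembles roughly the following statement (line breaks added for readability):
CREATE TABLE test.kafkaSourceTable(
  `num` INT,
  `statusKeyValueMap` ROW(`f1` STRING,`f2` INT,`f3` STRING),
  `rowtime` as TO_TIMESTAMP(FROM_UNIXTIME(`ts`/1000,'yyyy-MM-dd HH:mm:ss')),
  `vin` STRING,
  `ts` BIGINT,
  `pts` AS PROCTIME(),
  WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND
) WITH (
  'connector' = 'kafka',
  'topic' = 'veche',
  'properties.bootstrap.servers' = 'hadoop1:9092,hadoop2:9092,hadoop3:9092',
  'properties.group.id' = 'kkb',
  'format' = 'json',
  'scan.startup.mode' = 'latest-offset',
  'json.fail-on-missing-field' = 'false',
  'json.ignore-parse-errors' = 'true'
)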
Below are the Maven dependencies (pom.xml):
<dependencies>
<!-- Apache Flink dependencies -->
<!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<!-- Flink SQL dependencies -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_2.11</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_2.11</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<!-- Add connector dependencies here. They must be in the default scope (compile). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-sql-connector-kafka_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-hive_2.11</artifactId>
<version>${flink.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.flink</groupId>-->
<!--<artifactId>flink-shaded-hadoop-3</artifactId>-->
<!--<version>3.1.1.7.1.1.0-565-9.0</version>-->
<!--<scope>provided</scope> -->
<!--</dependency>-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-parquet_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-orc_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>1.0.1</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.18</version>
</dependency>
<!-- Add logging framework, to produce console output when running in the IDE. -->
<!-- These dependencies are excluded from the application JAR by default. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<version>${log4j.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>${log4j.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.72</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.18</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.flink</groupId>-->
<!--<artifactId>flink-shaded-hadoop-2-uber</artifactId>-->
<!--<version>${flink-shaded-hadoop.version}</version>-->
<!--</dependency>-->
</dependencies>