Flink SQL: Writing to Hive via a Hive Catalog


Flink can write data to Hive by connecting to a Hive catalog.

Important!

The Hive sink table must include the following table properties (a minimal DDL sketch follows the list):

'transactional' = 'false' 
'sink.partition-commit.policy.kind'='metastore,success-file'
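
In the Hive dialect, a minimal sink table DDL carrying these two properties might look like the sketch below. The table and column names are placeholders, and it assumes a StreamTableEnvironment named tableEnv that is already registered with a HiveCatalog, as in the full example that follows; the complete DDL actually used in this article is in the configuration file further down.

tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);   // Hive DDL must be issued in the Hive dialect
tableEnv.executeSql(
        "CREATE TABLE my_db.my_hive_sink ( "                            // placeholder database/table name
      + "  `id` INT, `name` STRING ) "                                  // placeholder columns
      + "PARTITIONED BY (dt STRING, hr STRING) STORED AS orc TBLPROPERTIES ( "
      + "  'transactional' = 'false', "
      + "  'sink.partition-commit.policy.kind' = 'metastore,success-file' )");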

Below is a complete example.

package com.kkb.flink.stream;

import com.kkb.flink.stream.util.FlinkUtils;
import com.kkb.flink.stream.util.PropertiesReader;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * @author china_tao
 * @since 2020-08-03 09:51
 */
public class KafkaToHiveCatlog1 {
    final static Logger logger = LoggerFactory.getLogger(KafkaToHiveCatlog1.class);

    public static void main(String[] args) throws Exception {
//        if (args.length < 1) {
//            System.err.println("The Flink job requires a configuration file; please pass the configuration file path");
//            return;
//        }

        //{"num":100,"statusKeyValueMap":{"f1":"stop","f2":20,"f3":444.444},"ts":1615022498959,"vin":"20200712082200002"}
        String filePath = "catlog1.properties";
        // initialize variables from the configuration file
        PropertiesReader propertiesReader = new PropertiesReader(filePath);
        String timecharacteristic = propertiesReader.getProperties("timecharacteristic");
        boolean isDebug = Boolean.parseBoolean(propertiesReader.getProperties("isDebug"));
        String kafkaTableName = propertiesReader.getProperties("kafkaTableName");
        String createKafkaSourceSQL = propertiesReader.getProperties("createKafkaSourceSQL");
        String kafkaTopic = propertiesReader.getProperties("kafka.topic");
        String kafkaBootstrapServers = propertiesReader.getProperties("kafka.bootstrap.servers");
        String kafkaGroupId = propertiesReader.getProperties("kafka.group.id");
        String kafkaFormat = propertiesReader.getProperties("kafka.format");
        String scanStartUpMode = propertiesReader.getProperties("scan.startup.mode");
        String hiveTableName = propertiesReader.getProperties("hiveTableName");
        String createHiveSinkSQL = propertiesReader.getProperties("createHiveSinkSQL");
        String alterHiveTableSQL = propertiesReader.getProperties("alterHiveTableSQL");
        String insertSQL = propertiesReader.getProperties("insertSQL");


        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // set the time characteristic: EventTime, ProcessingTime or IngestionTime
        FlinkUtils.handelTimeCharacteristic(env,timecharacteristic);
        env.enableCheckpointing(60 * 1000, CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointTimeout(90 * 1000);
//        env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
//        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
//        env.getCheckpointConfig().setPreferCheckpointForRecovery(true);
//        env.getCheckpointConfig().setTolerableCheckpointFailureNumber(5);
//        RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration = new RestartStrategies.FailureRateRestartStrategyConfiguration(3, Time.minutes(5), Time.seconds(10));
//        env.setRestartStrategy(restartStrategyConfiguration);

        final EnvironmentSettings setting = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, setting);

        // register the Hive catalog
        String catalogName = "my_catalog";
        String defaultDatabase = "default";
        HiveCatalog catalog = new HiveCatalog(
                catalogName,              // catalog name
                defaultDatabase,                // default database
                "/etc/hive/conf",  // Hive config (hive-site.xml) directory
                "2.1.0"                   // Hive version
        );
        tableEnv.registerCatalog(catalogName, catalog);
        tableEnv.useCatalog(catalogName);

        //test.kafkaSourceTable
        String kafkaDBname = kafkaTableName.split("\\.")[0];
        String kafkaTbname = kafkaTableName.split("\\.")[1];
        tableEnv.executeSql("CREATE DATABASE IF NOT EXISTS "+kafkaDBname);

        // 1. check whether the Hive sink table already exists in the catalog
        String hiveDBname = hiveTableName.split("\\.")[0];
        String hiveTbname = hiveTableName.split("\\.")[1];

        ObjectPath hiveTbop = new ObjectPath(hiveDBname,hiveTbname);
        if (isDebug) {
            System.out.println("Debug mode: dropping the Hive table");
            tableEnv.getCatalog(catalogName).get().dropTable(hiveTbop, true);
        }
        logger.info(hiveTbname + " exists: " + tableEnv.getCatalog(catalogName).get().tableExists(hiveTbop));
        boolean isHiveSinkTable =tableEnv.getCatalog(catalogName).get().tableExists(hiveTbop);


        System.out.println("hive 表是否存在:"+isHiveSinkTable);
        //2.删除catlog中kafkatable,每次重建,如果字段修改直接修改配置文件create语句,不需要alter语句
        //if set to false, throw an exception,
	    //if set to true, do nothing
        ObjectPath kafkaTableop = new ObjectPath(kafkaDBname,kafkaTbname);
        tableEnv.getCatalog(catalogName).get().dropTable(kafkaTableop,true);
        //3.重建catlog中kafkatable
        final String createKafkaSourceTableSQL = FlinkUtils.getCreateKafkaSourceTableSQL(kafkaTopic,kafkaBootstrapServers,kafkaGroupId,kafkaFormat,scanStartUpMode,createKafkaSourceSQL);
        tableEnv.executeSql(createKafkaSourceTableSQL);
        System.out.println("Kafka table " + kafkaTableName + " created");
        // 4. based on the check in step 1, create the Hive table if it does not exist

        if (!isHiveSinkTable) {
            System.out.println("Creating the Hive table");
            tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
            tableEnv.executeSql(createHiveSinkSQL);
        }

//        List<TableColumn> cols = tableEnv.getCatalog(catalogName).get().getTable(hiveTbop).getSchema().getTableColumns();
//        for (int j = 0; j <cols.size() ; j++) {
//            System.out.println(hiveTableName+" col--->"+cols.get(j).getName());
//        }
//
//        List<TableColumn> kafkacols = tableEnv.getCatalog(catalogName).get().getTable(kafkaTableop).getSchema().getTableColumns();
//        for (int j = 0; j <kafkacols.size() ; j++) {
//            System.out.println(kafkaTableName+" col--->"+kafkacols.get(j).getName());
//        }

        // 5. if the configuration file contains a Hive ALTER statement, execute it;
        //    an ALTER statement must always run after the table has been created

        if (StringUtils.isNotBlank(alterHiveTableSQL)) {
            System.out.println("Executing ALTER statement");
            tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE); // switch to the Hive dialect; Hive syntax is required here, Flink SQL syntax would fail
            tableEnv.executeSql(alterHiveTableSQL);
        }


        // 6. run the INSERT: select from the Kafka source table and write the result into the Hive table
        System.out.println("insert");
        tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT); // switch back to the default (Flink) dialect
        tableEnv.executeSql(insertSQL);

    }
}


The FlinkUtils.getCreateKafkaSourceTableSQL helper called in step 3 above:

    /**
     * Concatenates the CREATE TABLE statement for the Kafka source table from the user
     * configuration file with the connector/WITH options.
     * @param kafkaTopic
     * @param kafkaBootstrapServers
     * @param kafkaGroupId
     * @param kafkaFormat
     * @param scanStartUpMode
     * @param createKafkaSourceSQL
     * @return the full CREATE TABLE statement including the WITH clause
     */
    public static String getCreateKafkaSourceTableSQL(String kafkaTopic,String kafkaBootstrapServers,String kafkaGroupId,String kafkaFormat,String scanStartUpMode,String createKafkaSourceSQL) {

        String resultCreateTableSql = createKafkaSourceSQL + " WITH ( "
                +" 'connector' = 'kafka' ,"
                + " 'topic' = '" + kafkaTopic + "',"
                + " 'properties.bootstrap.servers' = '" + kafkaBootstrapServers + "',"
                + " 'properties.group.id' = '" + kafkaGroupId + "',"
                + " 'format' = '" + kafkaFormat + "',"
                + " 'scan.startup.mode' = '" + scanStartUpMode + "',"
                + " 'json.fail-on-missing-field' = 'false',"
                + " 'json.ignore-parse-errors' = 'true' )";

        return resultCreateTableSql;
    }
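
The PropertiesReader and FlinkUtils classes referenced above are the author's own utilities and are not shown in the post. Below is a minimal sketch of what they might look like, assuming PropertiesReader simply wraps java.util.Properties and handelTimeCharacteristic maps the configured string onto Flink's (pre-1.12) TimeCharacteristic API; treat it as a placeholder, not the author's actual implementation.

import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.io.InputStream;
import java.util.Properties;

// Hypothetical sketch of PropertiesReader: loads a .properties file from the classpath
// and returns values by key, matching how it is called in the example above.
class PropertiesReader {
    private final Properties props = new Properties();

    PropertiesReader(String filePath) throws Exception {
        try (InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(filePath)) {
            props.load(in);
        }
    }

    String getProperties(String key) {
        return props.getProperty(key);
    }
}

// Hypothetical sketch of FlinkUtils.handelTimeCharacteristic: maps the configured string
// (eventTime / ingestionTime / processingTime) to Flink's TimeCharacteristic.
// The getCreateKafkaSourceTableSQL method shown above also belongs to this class.
class FlinkUtils {
    static void handelTimeCharacteristic(StreamExecutionEnvironment env, String timecharacteristic) {
        if ("eventTime".equals(timecharacteristic)) {
            env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        } else if ("ingestionTime".equals(timecharacteristic)) {
            env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
        } else {
            env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
        }
    }
}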

 

Below is the configuration file (catlog1.properties).

# eventTime, ingestionTime or processingTime. Note: choose exactly one of these three values; case-sensitive
timecharacteristic=eventTime

# true during development, false in production. If true, the tables created by the previous run are dropped on every restart, which makes debugging easier. If false, the previous tables are kept.
isDebug=false
#databaseName=
#################################### Kafka settings ####################################
# Name of the Kafka source table, in db.table format; it cannot be created in the default database
kafkaTableName=test.kafkaSourceTable
# Kafka table schema; nested objects are declared as ROW, so statusKeyValueMap.f1 later retrieves the corresponding value
# Computed columns: rowtime derives the event time from ts, pts is a processing-time column, and the watermark tolerates 1 second of out-of-order data (the interval is user-configurable)
createKafkaSourceSQL=CREATE TABLE test.kafkaSourceTable( \
`num` INT, \
`statusKeyValueMap` ROW(`f1` STRING,`f2` INT,`f3` STRING), \
`rowtime` as TO_TIMESTAMP(FROM_UNIXTIME(`ts`/1000,'yyyy-MM-dd HH:mm:ss')), \
`vin` STRING,\
`ts`  BIGINT, \
`pts` AS PROCTIME(), \
 WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND)


# Kafka topic
kafka.topic=veche
# Kafka bootstrap servers
kafka.bootstrap.servers=hadoop1:9092,hadoop2:9092,hadoop3:9092
# Kafka consumer group ID
kafka.group.id=kkb
# Kafka message format
kafka.format=json
# Kafka startup offset: consume from the latest offset
scan.startup.mode=latest-offset

#################################### Hive settings ####################################
hiveTableName=test_datax.ods_veh_vehicle_status_his_rt
createHiveSinkSQL= CREATE TABLE  test_datax.ods_veh_vehicle_status_his_rt ( \
`num` INT,\
`f1` STRING, \
`f2` INT, \
`f3` STRING, \
`vin` STRING,\
`ts` BIGINT ) \
PARTITIONED BY (dt STRING, hr STRING) STORED AS orc TBLPROPERTIES ( \
'sink.partition-commit.trigger'='partition-time' , \
'transactional' = 'false' , \
'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00' , \
'sink.partition-commit.policy.kind'='metastore,success-file')

alterHiveTableSQL=
#################################### INSERT statement ####################################
insertSQL=INSERT INTO test_datax.ods_veh_vehicle_status_his_rt SELECT num,statusKeyValueMap.f1 as f1,statusKeyValueMap.f2 as f2 ,\
statusKeyValueMap.f3 as f3 , vin ,ts, \
DATE_FORMAT(rowtime, 'yyyy-MM-dd'), DATE_FORMAT(rowtime, 'HH')  FROM test.kafkaSourceTable
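
For reference, with the values above, FlinkUtils.getCreateKafkaSourceTableSQL assembles a statement equivalent to the Java string below, which the job then passes to tableEnv.executeSql. This is only an illustration of what the helper produces (line breaks added for readability), not extra code you need to write.

// createKafkaSourceSQL from the properties file plus the WITH clause appended by
// getCreateKafkaSourceTableSQL, using the Kafka settings configured above.
String assembledKafkaDDL =
        "CREATE TABLE test.kafkaSourceTable( "
      + " `num` INT, "
      + " `statusKeyValueMap` ROW(`f1` STRING, `f2` INT, `f3` STRING), "
      + " `rowtime` AS TO_TIMESTAMP(FROM_UNIXTIME(`ts`/1000, 'yyyy-MM-dd HH:mm:ss')), "
      + " `vin` STRING, "
      + " `ts` BIGINT, "
      + " `pts` AS PROCTIME(), "
      + " WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND "
      + ") WITH ( "
      + " 'connector' = 'kafka', "
      + " 'topic' = 'veche', "
      + " 'properties.bootstrap.servers' = 'hadoop1:9092,hadoop2:9092,hadoop3:9092', "
      + " 'properties.group.id' = 'kkb', "
      + " 'format' = 'json', "
      + " 'scan.startup.mode' = 'latest-offset', "
      + " 'json.fail-on-missing-field' = 'false', "
      + " 'json.ignore-parse-errors' = 'true' )";
tableEnv.executeSql(assembledKafkaDDL);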
Below are the Maven dependencies (pom.xml):

   <dependencies>
        <!-- Apache Flink dependencies -->
        <!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
             <scope>provided</scope>

        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
             <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
             <scope>provided</scope>
        </dependency>

        <!-- Flink SQL dependencies -->

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.11</artifactId>
            <version>${flink.version}</version>
             <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink.version}</version>
             <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>${flink.version}</version>
             <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
             <scope>provided</scope>
        </dependency>
        <!-- Add connector dependencies here. They must be in the default scope (compile). -->


        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-connector-kafka_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_2.11</artifactId>
            <version>${flink.version}</version>
<!--             <scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
<!--             <scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
             <scope>provided</scope>
        </dependency>
        <!--<dependency>-->
        <!--<groupId>org.apache.flink</groupId>-->
        <!--<artifactId>flink-shaded-hadoop-3</artifactId>-->
        <!--<version>3.1.1.7.1.1.0-565-9.0</version>-->
        <!--<scope>provided</scope> -->
        <!--</dependency>-->


        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-parquet_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-orc_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>1.0.1</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.18</version>
        </dependency>


        <!-- Add logging framework, to produce console output when running in the IDE. -->
        <!-- These dependencies are excluded from the application JAR by default. -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>${log4j.version}</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>${log4j.version}</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>${log4j.version}</version>
            <scope>runtime</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.72</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.18</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!--<dependency>-->
        <!--<groupId>org.apache.flink</groupId>-->
        <!--<artifactId>flink-shaded-hadoop-2-uber</artifactId>-->
        <!--<version>${flink-shaded-hadoop.version}</version>-->
        <!--</dependency>-->
    </dependencies>

 
