首先呢,flink写入hive已经在1.10就实现了,但是我们这么用呢,其实是大多数公司不得已的情况,也是慢慢转型而来的一个适中的使用情况。Apache也为我们考虑提供了支持,在分布式环境、流计算的今天为我们提供了更好的帮助。感谢这些社区贡献者和大佬们的研究分享。以下是实现的一个小demo,大家共同分析学习,有问题交流学习。
注意: 在本地环境读数据要是可以的话,写数据就一定可以的。写的时候需要注意服务器上的环境,主要是权限和jar依赖。
1.代码实现
1.1 使用tableEnvironment读取catalog配置,然后用SQL操作hive
1.先来个最基本的测试demo,测试通过在看后面的,这个是从hive表里读取数据然后写入hive
package flink.java.connector.hive.write;
import flink.java.utils.HiveResourceInfo;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.connector.jdbc.JdbcInputFormat;
import org.apache.flink.table.api.*;
import org.apache.flink.table.api.bridge.java.BatchTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.types.Row;
import java.util.List;
import java.util.Properties;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Batch demo: copies rows from an existing Hive table ({@code test.hive_test7})
 * into a partitioned, ORC-backed external Hive table ({@code fs_tables}) using
 * the Flink Table API with a {@link HiveCatalog} and the Hive SQL dialect.
 *
 * <p>Catalog name, default database and the hive-site.xml directory are loaded
 * from an external properties file via {@link HiveResourceInfo}.
 */
public class Hive2Hive {
    public static void main(String[] args) throws Exception {
        // Batch mode: the source table is bounded, the job runs once and terminates.
        EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build();
        TableEnvironment tableEnv = TableEnvironment.create(settings);

        // External configuration: CATALOG.NAME / CATALOG.DB / CATALOG.HIVECONFDIR.
        Properties hiveConf = HiveResourceInfo.getHiveConf();
        HiveCatalog hive = new HiveCatalog(hiveConf.getProperty("CATALOG.NAME"),
                hiveConf.getProperty("CATALOG.DB"),
                hiveConf.getProperty("CATALOG.HIVECONFDIR"));
        tableEnv.registerCatalog("myhive", hive);
        tableEnv.useCatalog("myhive");
        // The Hive dialect is required for Hive-specific DDL
        // (external tables, "stored as orc", tblproperties, ...).
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // "if not exists" keeps the job re-runnable: a plain CREATE would fail
        // with "table already exists" on every run after the first.
        String hiveSql = "create external table if not exists fs_tables (" +
                " id string," +
                " name String" +
                ") partitioned by (dt string) " +
                "stored as orc " +
                "tblproperties (" +
                " 'partition.time-extractor.timestamp-pattern'='$dt'," +
                " 'sink.partition-commit.delay'='0s'," +
                " 'sink.partition-commit.trigger'='partition-time'," +
                " 'sink.partition-commit.policy.kind'='metastore'" +
                ")";
        tableEnv.executeSql(hiveSql);

        // NOTE(review): the third selected column (age) is written into the
        // partition column dt by position — confirm this mapping is intentional.
        tableEnv.executeSql("insert into fs_tables select id,name,age from test.hive_test7 where age > 18 ");
    }
}
1.2 使用jdbc的方式读取hive的dataset数据,然后使用tableEnvironment读取catalog配置,写入hive
注意,我们不能直接使用batchTableEnvironment的方式,将批数据转成table然后使用table的API直接insert到hive,这两种方式的table是不一样的。如果使用batchTableEnvironment,那么就必须使用batchTableSink或者OutputFormatTableSink。
所以,如果我们读出来的是批数据:数据量不是很大的话,可以直接放到一个集合里,然后使用table加载到表中,再使用tableEnvironment的方式去操作,很香;如果数据量很大,那么就用流的方式去处理吧。每个版本都有很大的变化,在flink 1.10才加入对hive的DDL/DML支持,我们是基于1.11.1,所以可以结合当前版本和官网的说明,根据当前业务来定义。
public static void main(String[] args) throws Exception {
EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build();
TableEnvironment tableEnv = TableEnvironment.create(settings);
Properties hiveConf = HiveResourceInfo.getHiveConf();
//读取catalog配置
HiveCatalog hive = new HiveCatalog(
hiveConf.getProperty("CATALOG.NAME"),
hiveConf.getProperty("CATALOG.DB"),
hiveConf.getProperty("CATALOG.HIVECONFDIR")
);
tableEnv.registerCatalog("myhive", hive);
tableEnv.useCatalog("myhive");
tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
//创建落地表
String hiveSql = "create external table fs_tables (" +
" id string," +
" name String" +
") partitioned by (dt string) " +
"stored as orc " +
"tblproperties (" +
" 'partition.time-extractor.timestamp-pattern'='$dt'," +
" 'sink.partition-commit.delay'='0s'," +
" 'sink.partition-commit.trigger'='partition-time'," +
" 'sink.partition-commit.policy.kind'='metastore'" +
")";
tableEnv.executeSql(hiveSql);
// Table result = table.groupBy($("age")).select($("id"), $("name"), $("age"));
// Table table = tableEnv.sqlQuery("select id,name,age from test.hive_test7 where age > 18");
tableEnv.executeSql("SELECT * FROM Orders");
//落地数据
tableEnv.executeSql("insert into fs_tables select id,name,age from test.hive_test7 where age > 18 ");
配置文件的加载
package flink.java.utils;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Objects;
import java.util.Properties;
public class HiveResourceInfo {
private static HiveResourceInfo resourceInfo=null;
private static Properties prop=null;
private HiveResourceInfo(){}
public static synchronized Properties getHiveConf() throws Exception {
if(null == resourceInfo){
resourceInfo=new HiveResourceInfo();
InputStream inputStream = HiveResourceInfo.class
.getClassLoader()
.getResourceAsStream("hiveInfo/hiveConf.properties");
if (Objects.isNull(inputStream)) {
throw new Exception("can not read connInfo/hive/hiveConf.properties");
}
if(null == prop){
prop=new Properties();
}
pro