首先呢,flink写入hive已经在1.10就实现了,但是我们这么用呢,其实是大多数公司不得已的情况,也是慢慢转型而来的一个适中的使用情况。Apache也为我们考虑提供了支持,在分布式环境、流计算的今天为我们提供了更好的帮助。感谢这些社区贡献者和大佬们的研究分享。以下是实现的一个小demo,大家共同分析学习,有问题交流学习。
注意: 在本地环境读数据要是可以的话,写数据就一定可以的。写的时候需要注意服务器上的环境,主要是权限和jar依赖。
1.代码实现
1.1 使用tableEnvironment读取catalog配置,然后用SQL操作hive
1.先来个最基本的测试demo,测试通过在看后面的,这个是从hive表里读取数据然后写入hive
package flink.java.connector.hive.write;
import flink.java.utils.HiveResourceInfo;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.connector.jdbc.JdbcInputFormat;
import org.apache.flink.table.api.*;
import org.apache.flink.table.api.bridge.java.BatchTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;
import org.apache.flink.types.Row;
import java.util.List;
import java.util.Properties;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Batch demo: copies rows from an existing Hive table ({@code test.hive_test7})
 * into a partitioned, ORC-backed external Hive table ({@code fs_tables}) using
 * the Flink Table API with a {@link HiveCatalog} and the Hive SQL dialect.
 *
 * <p>Catalog name, default database and the hive-site.xml directory are loaded
 * from an external properties file via {@link HiveResourceInfo}.
 */
public class Hive2Hive {
    public static void main(String[] args) throws Exception {
        // Batch mode: the source table is bounded, the job runs once and terminates.
        EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build();
        TableEnvironment tableEnv = TableEnvironment.create(settings);

        // External configuration: CATALOG.NAME / CATALOG.DB / CATALOG.HIVECONFDIR.
        Properties hiveConf = HiveResourceInfo.getHiveConf();
        HiveCatalog hive = new HiveCatalog(hiveConf.getProperty("CATALOG.NAME"),
                hiveConf.getProperty("CATALOG.DB"),
                hiveConf.getProperty("CATALOG.HIVECONFDIR"));
        tableEnv.registerCatalog("myhive", hive);
        tableEnv.useCatalog("myhive");
        // The Hive dialect is required for Hive-specific DDL
        // (external tables, "stored as orc", tblproperties, ...).
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);

        // "if not exists" keeps the job re-runnable: a plain CREATE would fail
        // with "table already exists" on every run after the first.
        String hiveSql = "create external table if not exists fs_tables (" +
                " id string," +
                " name String" +
                ") partitioned by (dt string) " +
                "stored as orc " +
                "tblproperties (" +
                " 'partition.time-extractor.timestamp-pattern'='$dt'," +
                " 'sink.partition-commit.delay'='0s'," +
                " 'sink.partition-commit.trigger'='partition-time'," +
                " 'sink.partition-commit.policy.kind'='metastore'" +
                ")";
        tableEnv.executeSql(hiveSql);

        // NOTE(review): the third selected column (age) is written into the
        // partition column dt by position — confirm this mapping is intentional.
        tableEnv.executeSql("insert into fs_tables select id,name,age from test.hive_test7 where age > 18 ");
    }
}
1.2 使用jdbc的方式读取hive的dataset数据,然后使用tableEnvironment读取catalog配置,写入hive
注意,我们不能直接使用batchTableEnvironment的方式,将批数据转成table然后使用table的API直接insert到hive,这两种方式的table是不一样的。如果使用batchTableEnvironment,那么就必须使用batchTableSink或者OutputFormatTableSink。
所以,如果我们读出来的是批数据:数据量不是很大的话,可以直接放到一个集合里,然后使用table加载到表中,再使用tableEnvironment的方式去操作,很香;如果数据量很大,那么就用流的方式去处理吧。每个版本都有很大的变化,在flink 1.10才加入对hive的DDL/DML支持,我们是基于1.11.1,所以可以结合当前版本和官网的说明,根据当前业务来定义。
public static void main(String[] args) throws Exception {
EnvironmentSettings settings = EnvironmentSettings.newInstance().inBatchMode().build();
TableEnvironment tableEnv = TableEnvironment.create(settings);
Properties hiveConf = HiveResourceInfo.getHiveConf();
//读取catalog配置
HiveCatalog hive = new HiveCatalog(
hiveConf.getProperty("CATALOG.NAME"),
hiveConf.getProperty("CATALOG.DB"),
hiveConf.getProperty("CATALOG.HIVECONFDIR")
);
tableEnv.registerCatalog("myhive", hive);
tableEnv.useCatalog("myhive");
tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
//创建落地表
String hiveSql = "create external table fs_tables (" +
" id string," +
" name String" +
") partitioned by (dt string) " +
"stored as orc " +
"tblproperties (" +
" 'partition.time-extractor.timestamp-pattern'='$dt'," +
" 'sink.partition-commit.delay'='0s'," +
" 'sink.partition-commit.trigger'='partition-time'," +
" 'sink.partition-commit.policy.kind'='metastore'" +
")";
tableEnv.executeSql(hiveSql);
// Table result = table.groupBy($("age")).select($("id"), $("name"), $("age"));
// Table table = tableEnv.sqlQuery("select id,name,age from test.hive_test7 where age > 18");
tableEnv.executeSql("SELECT * FROM Orders");
//落地数据
tableEnv.executeSql("insert into fs_tables select id,name,age from test.hive_test7 where age > 18 ");
配置文件的加载
package flink.java.utils;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Objects;
import java.util.Properties;
public class HiveResourceInfo {
private static HiveResourceInfo resourceInfo=null;
private static Properties prop=null;
private HiveResourceInfo(){}
public static synchronized Properties getHiveConf() throws Exception {
if(null == resourceInfo){
resourceInfo=new HiveResourceInfo();
InputStream inputStream = HiveResourceInfo.class
.getClassLoader()
.getResourceAsStream("hiveInfo/hiveConf.properties");
if (Objects.isNull(inputStream)) {
throw new Exception("can not read connInfo/hive/hiveConf.properties");
}
if(null == prop){
prop=new Properties();
}
pro