Hudi To Hive

First, copy the hudi-hadoop-mr bundle jar into Hive's lib directory, then distribute it to the other nodes:

cp packaging/hudi-hadoop-mr-bundle/target/hudi-hadoop-mr-bundle-0.5.3.jar /opt/hdk/hive/lib/

[root@ha1 /]# scp /opt/hdk/hive/lib/hudi-hadoop-mr-bundle-0.5.3.jar root@ha2:/opt/hdk/hive/lib/
[root@ha1 /]# scp /opt/hdk/hive/lib/hudi-hadoop-mr-bundle-0.5.3.jar root@ha3:/opt/hdk/hive/lib/

# Start the Hive services
nohup hive --service metastore > metastore.log 2>&1 &
nohup hive --service hiveserver2 > hiveserver2.log 2>&1 &

Importing into Hive

package com.hudi

import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions}
import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig}
import org.apache.hudi.hive.HiveSyncTool
import org.apache.hudi.index.HoodieIndex
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.hj.hudi.Util

object HudiToHive {
    def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("delta hiveSync").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[3]").getOrCreate()
        val upsertData = Util.readFromTxtByLineToDf(spark, "E:\\Demo\\hudi-test-master\\src\\main\\resources\\hive_sync.txt")

        upsertData.write.format("org.apache.hudi")
            // Record key column
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "rowkey")
            // Precombine column used to pick the latest record on update
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "lastupdatedttm")
            // Partition column
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "dt")
            // Hive database to sync to
            .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "hudi")
            // Hive table to sync to
            .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "test_hive_hudi")
            // Register the dataset and sync it to Hive
            .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
            // Whether a record's partition path is updated when its partition value changes
            .option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true")
            // Partition column to sync to Hive
            .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "dt")
            // HiveServer2 JDBC URL used for the sync
            .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://192.168.110.110:10000")
            // Hudi table name
            .option(HoodieWriteConfig.TABLE_NAME, "test_partition")
            // Class used to extract partition field values into Hive partition columns; here the partition value is used as-is
            .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, "org.apache.hudi.hive.MultiPartKeysValueExtractor")
            // Index type: HBASE, INMEMORY, BLOOM or GLOBAL_BLOOM; GLOBAL_BLOOM is required so records can still be found after a partition change
            .option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name())
            // Shuffle parallelism
            .option("hoodie.insert.shuffle.parallelism", "2")
            .option("hoodie.upsert.shuffle.parallelism", "2")
            .mode(SaveMode.Append)
            .save("/tmp/hudi")
    }

    def hiveSyncMergeOnReadByUtil(): Unit = {
        val args: Array[String] = Array("--jdbc-url", "jdbc:hive2://192.168.110.110:10000", "--partition-value-extractor", "org.apache.hudi.hive.MultiPartKeysValueExtractor", "--user", "hive", "--pass", "hive", "--partitioned-by", "dt", "--base-path", "/tmp/hive_hudi", "--database", "hudi", "--table", "test_hive_hudi")
        HiveSyncTool.main(args)
    }
}
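
To confirm the upsert before relying on the Hive sync, the dataset can also be read back directly with the Spark datasource. The following is a minimal sketch (not part of the original write job) that assumes the /tmp/hudi base path and the single-level dt partitioning configured above; with 0.5.x the datasource read needs a path glob with one level per partition column plus one for the data files.

package com.hudi

import org.apache.spark.sql.SparkSession

object HudiSnapshotRead {
    def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("hudi snapshot read")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .master("local[3]")
            .getOrCreate()

        // One glob level for the dt partition directories and one for the parquet files under them
        val df = spark.read.format("org.apache.hudi").load("/tmp/hudi/*/*")
        df.createOrReplaceTempView("hudi_snapshot")
        // _hoodie_commit_time is Hudi metadata and shows which commit produced each record
        spark.sql("select _hoodie_commit_time, rowkey, lastupdatedttm, dt from hudi_snapshot").show(false)
    }
}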

Querying the read-optimized view from Hive

package com.hudi

import java.sql.DriverManager
import java.util.Properties

object HiveRoViewRead {
    def main(args: Array[String]): Unit = {
        // Target table
        val sourceTable = "test_partition"

        Class.forName("org.apache.hive.jdbc.HiveDriver")
        val prop = new Properties()
        prop.put("user", "hive")
        prop.put("password", "hive")
        val conn = DriverManager.getConnection("jdbc:hive2://192.168.110.110:10000/hudi", prop)
        val stmt = conn.createStatement
        // Allow queries without a partition predicate
        stmt.execute("set hive.strict.checks.large.query=false")
        // Don't gather stats for the table created
        stmt.execute("set hive.stats.autogather=false")

        // A plain select against the synced table reads the read-optimized view
        val rs = stmt.executeQuery("select * from " + sourceTable)
        val metaData = rs.getMetaData
        val count = metaData.getColumnCount


        while (rs.next()) {
            for (i <- 1 to count) {
                println(metaData.getColumnName(i) + ":" + rs.getObject(i).toString)
            }
            println("-----------------------------------------------------------")
        }

        rs.close()
        stmt.close()
        conn.close()
    }
}

Querying the incremental view from Hive

package com.hudi

import java.sql.DriverManager
import java.util.Properties

object HiveIncrementalViewRead {
    def main(args: Array[String]): Unit = {
        // Target table
        val sourceTable = "test_partition"
        // Commit time the incremental view starts from
        val fromCommitTime = "20200220094506"
        // Number of commits to pull after that point
        val maxCommits = "2"

        Class.forName("org.apache.hive.jdbc.HiveDriver")
        val prop = new Properties()
        prop.put("user", "hive")
        prop.put("password", "hive")
        val conn = DriverManager.getConnection("jdbc:hive2://192.168.110.110:10000/hudi", prop)
        val stmt = conn.createStatement
        // Input format required for incremental queries through Hive
        stmt.execute("set hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat")
        // Allow queries without a partition predicate
        stmt.execute("set hive.strict.checks.large.query=false")
        // Don't gather stats for the table created
        stmt.execute("set hive.stats.autogather=false")
        // Set the hoodie consume mode to incremental
        stmt.execute("set hoodie." + sourceTable + ".consume.mode=INCREMENTAL")
        // Set the commit time to start from
        stmt.execute("set hoodie." + sourceTable + ".consume.start.timestamp=" + fromCommitTime)
        // Set the number of commits to pull
        stmt.execute("set hoodie." + sourceTable + ".consume.max.commits=" + maxCommits)

        val rs = stmt.executeQuery("select * from " + sourceTable)
        val metaData = rs.getMetaData
        val count = metaData.getColumnCount


        while (rs.next()) {
            for (i <- 1 to count) {
                println(metaData.getColumnName(i) + ":" + rs.getObject(i).toString)
            }
            println("-----------------------------------------------------------")
        }

        rs.close()
        stmt.close()
        conn.close()
    }
}
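
The same incremental pull can also be done through the Spark datasource instead of Hive JDBC. The following is a minimal sketch, assuming the /tmp/hudi table written earlier and the 0.5.x read option names (VIEW_TYPE_OPT_KEY, BEGIN_INSTANTTIME_OPT_KEY); only records committed after fromCommitTime are returned.

package com.hudi

import org.apache.hudi.DataSourceReadOptions
import org.apache.spark.sql.SparkSession

object HudiIncrementalRead {
    def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder.appName("hudi incremental read")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .master("local[3]")
            .getOrCreate()

        // Commit time to start from, same value as in the Hive example above
        val fromCommitTime = "20200220094506"

        val incDf = spark.read.format("org.apache.hudi")
            // Incremental view: only records written after fromCommitTime are read
            .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL)
            .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, fromCommitTime)
            // Incremental reads take the base path directly, without the partition glob
            .load("/tmp/hudi")

        incDf.show(false)
    }
}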
