Merge_On_Read Mode Operations
Inserting Data
The key to writing a merge-on-read table is adding the option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL) parameter, which switches the table type from the default copy-on-write to merge-on-read.
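Both writers below load their test data through a project helper, Util.readFromTxtByLineToDf, whose implementation is not shown in this article. A minimal sketch of such a helper, assuming the test files are comma-separated with a header row (the actual file layout is not specified here), might look like:

package org.hj.hudi

import org.apache.spark.sql.{DataFrame, SparkSession}

object Util {
  // Hypothetical helper: load a delimited text file into a DataFrame.
  // Assumes a comma-separated file with a header row that includes at least
  // the rowkey, lastupdatedttm and dt columns used by the writers below.
  def readFromTxtByLineToDf(spark: SparkSession, path: String): DataFrame = {
    spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(path)
  }
}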
package com.hudi

import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig}
import org.apache.hudi.index.HoodieIndex
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.hj.hudi.Util

object insertPartitionMergeOnRead {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("hudi insert").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[3]").getOrCreate()
    // Read the text file into a DataFrame
    val insertData = Util.readFromTxtByLineToDf(spark, "E:\\Demo\\hudi-test-master\\src\\main\\resources\\test_insert_data.txt")
    insertData.write.format("org.apache.hudi")
      // Configure merge-on-read as the table type
      .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
      // Set the record key column
      .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "rowkey")
      // Set the column holding the record update time (pre-combine field)
      .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "lastupdatedttm")
      // Set the partition column
      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "dt")
      // Whether a record's partition directory moves when its partition value changes
      .option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true")
      // Index type: HBASE, INMEMORY, BLOOM and GLOBAL_BLOOM are available; GLOBAL_BLOOM
      // is required so that records can still be located after a partition change
      .option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name())
      // Parallelism settings
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      // Hudi table name
      .option(HoodieWriteConfig.TABLE_NAME, "test_partition_merge_on_read")
      .mode(SaveMode.Overwrite)
      .save("/tmp/hudi_merge_on_read")
  }
}
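To sanity-check the write, the table can be loaded back through the Hudi datasource. Below is a minimal sketch (readMergeOnRead is a name introduced here), assuming the single-level dt partition layout used above, so the glob reaches the data files two levels below the base path; for a merge-on-read table a plain datasource read of this kind returns the read-optimized view:

package com.hudi

import org.apache.spark.sql.SparkSession

object readMergeOnRead {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("hudi read").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[3]").getOrCreate()
    // Glob down to the data files inside each dt partition directory;
    // for merge-on-read this reads only the base files (read-optimized view)
    val df = spark.read.format("org.apache.hudi").load("/tmp/hudi_merge_on_read/*/*")
    df.show(false)
  }
}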
Syncing to Hive
package com.hudi

import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig}
import org.apache.hudi.hive.HiveSyncTool
import org.apache.hudi.index.HoodieIndex
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.hj.hudi.Util

object hiveSyncMergeOnRead {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("delta hiveSync").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[3]").getOrCreate()
    val upsertData = Util.readFromTxtByLineToDf(spark, "E:\\Demo\\hudi-test-master\\src\\main\\resources\\hive_sync.txt")
    upsertData.write.format("org.apache.hudi")
      // Configure merge-on-read
      .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
      // Set the record key column
      .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "rowkey")
      // Set the column holding the record update time (pre-combine field)
      .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "lastupdatedttm")
      // Set the partition column
      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "dt")
      // Hive database to sync to
      .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "hudi")
      // Hive table to sync to
      .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "test_partition_merge_on_read")
      // Register the dataset and sync it to Hive
      .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
      // Whether a record's partition directory moves when its partition value changes
      .option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true")
      // Partition column to sync
      .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "dt")
      // JDBC URL used for the sync
      .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://192.168.110.110:10000")
      // Hudi table name
      .option(HoodieWriteConfig.TABLE_NAME, "test_partition_merge_on_read")
      // Class used to extract partition field values into Hive partition columns;
      // here the current partition value is synced as-is
      .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, "org.apache.hudi.hive.MultiPartKeysValueExtractor")
      // Index type: HBASE, INMEMORY, BLOOM and GLOBAL_BLOOM are available; GLOBAL_BLOOM
      // is required so that records can still be located after a partition change
      .option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name())
      // Parallelism settings
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      .mode(SaveMode.Append)
      .save("/tmp/hive_merge_on_read")
  }

  // Alternative: run the standalone HiveSyncTool against an existing Hudi table
  def hiveSyncMergeOnReadByUtil(): Unit = {
    val args: Array[String] = Array(
      "--jdbc-url", "jdbc:hive2://192.168.110.110:10000",
      "--partition-value-extractor", "org.apache.hudi.hive.MultiPartKeysValueExtractor",
      "--user", "hive",
      "--pass", "hive",
      "--partitioned-by", "dt",
      "--base-path", "/tmp/hudi_merge_on_read",
      "--database", "hudi",
      "--table", "test_partition_merge_on_read")
    HiveSyncTool.main(args)
  }
}
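When a merge-on-read table is synced, two Hive tables are registered: test_partition_merge_on_read_ro, the read-optimized view backed by the compacted base files only, and test_partition_merge_on_read_rt, the real-time view that merges base files with the delta log files at query time. A quick way to confirm the sync succeeded is to list the tables over the same JDBC connection; showSyncedHudiTables below is a sketch introduced here, reusing the connection settings from the examples:

package com.hudi

import java.sql.DriverManager
import java.util.Properties

object showSyncedHudiTables {
  def main(args: Array[String]): Unit = {
    Class.forName("org.apache.hive.jdbc.HiveDriver")
    val prop = new Properties()
    prop.put("user", "hive")
    prop.put("password", "hive")
    val conn = DriverManager.getConnection("jdbc:hive2://192.168.110.110:10000/hudi", prop)
    val stmt = conn.createStatement
    // Both the _ro and _rt tables should appear after a successful sync
    val rs = stmt.executeQuery("show tables")
    while (rs.next()) {
      println(rs.getString(1))
    }
    rs.close()
    stmt.close()
    conn.close()
  }
}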
Querying the Read-Optimized View via Hive
package com.hudi

import java.sql.DriverManager
import java.util.Properties

object mergeOnReadReadoptimizedViewByHive {
  def main(args: Array[String]): Unit = {
    // Target table: the read-optimized view (_ro suffix)
    val sourceTable = "test_partition_merge_on_read_ro"
    Class.forName("org.apache.hive.jdbc.HiveDriver")
    val prop = new Properties()
    prop.put("user", "hive")
    prop.put("password", "hive")
    val conn = DriverManager.getConnection("jdbc:hive2://192.168.110.110:10000/hudi", prop)
    val stmt = conn.createStatement
    val rs = stmt.executeQuery("select * from " + sourceTable)
    val metaData = rs.getMetaData
    val count = metaData.getColumnCount
    // Print every column of every row
    while (rs.next()) {
      for (i <- 1 to count) {
        println(metaData.getColumnName(i) + ":" + rs.getObject(i).toString)
      }
      println("-----------------------------------------------------------")
    }
    rs.close()
    stmt.close()
    conn.close()
  }
}
Querying the Real-Time View via Hive
package com.hudi

import java.sql.DriverManager
import java.util.Properties

object mergeOnReadRealtimeViewByHive {
  def main(args: Array[String]): Unit = {
    // Target table: the real-time view (_rt suffix)
    val sourceTable = "test_partition_merge_on_read_rt"
    Class.forName("org.apache.hive.jdbc.HiveDriver")
    val prop = new Properties()
    prop.put("user", "hive")
    prop.put("password", "hive")
    val conn = DriverManager.getConnection("jdbc:hive2://192.168.110.110:10000/hudi", prop)
    val stmt = conn.createStatement
    val rs = stmt.executeQuery("select * from " + sourceTable)
    val metaData = rs.getMetaData
    val count = metaData.getColumnCount
    // Print every column of every row
    while (rs.next()) {
      for (i <- 1 to count) {
        println(metaData.getColumnName(i) + ":" + rs.getObject(i).toString)
      }
      println("-----------------------------------------------------------")
    }
    rs.close()
    stmt.close()
    conn.close()
  }
}