Maven Configuration
<dependencies>
    <dependency>
        <groupId>org.apache.hudi</groupId>
        <artifactId>hudi-common</artifactId>
        <version>0.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-avro_2.11</artifactId>
        <version>2.4.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hudi</groupId>
        <artifactId>hudi-spark_2.11</artifactId>
        <version>0.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hudi</groupId>
        <artifactId>hudi-spark-bundle_2.11</artifactId>
        <version>0.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>2.4.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.4.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.4.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>2.4.6</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-reflect</artifactId>
        <version>2.11.8</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.19</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>2.3.4</version>
    </dependency>
</dependencies>
Data Preparation
Test data
uuid,name,addr,phone,update_date,bir_date
1,逝去的青春,上海市宝山区,183****1111,20200805,20020101
2,葬爱,上海市虹口区,183****2222,20200805,20020101
3,罙罙の回憶,上海市虹口区,183****3333,20200805,20020101
4,忘了天空的颜色,上海市虹口区,183****4444,20200805,20020101
5,李彦龙,上海市松江区,183****5555,20200801,20010101
6,李浩鹏,上海市松江区,183****6666,20200801,20010101
7,李天一,上海市松江区,183****7777,20200801,20010101
8,李朵雯,上海市松江区,183****8888,20200801,20010101
9,李雨杭,上海市松江区,183****9999,20200801,20010101
10,王满,杭州市西湖区,153****0000,20200802,20000101
11,王琳,杭州市西湖区,153****1111,20200802,20000101
12,王昕,杭州市西湖区,153****2222,20200802,20000101
13,贾一一,杭州市西湖区,153****3333,20200802,20000101
14,石浩,西安市莲湖区,137****4444,20200803,19970101
15,石子彤,西安市莲湖区,137****5555,20200803,19970101
16,许放炮的,西安市莲湖区,137****6666,20200803,19970101
Create the table
To import the test data into Hive, first create a target table:
CREATE TABLE `hudi.user_info_demo`(
  `uuid` string,
  `name` string,
  `addr` string,
  `phone` string,
  `update_date` string,
  `bir_date` string
) STORED AS parquet;
Load the test data
// Schema for the test data
case class UserInfo(uuid: String, name: String, addr: String, phone: String, update_date: String, bir_date: String)

val sparkSession: SparkSession = SparkSession.builder()
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // Hudi requires the Kryo serializer
  .appName("Hudi Test")
  .master("local[*]")
  .enableHiveSupport()
  .getOrCreate()

@Test
def write2hive(): Unit = {
  import sparkSession.implicits._
  // Parse the CSV test data into a DataFrame
  val userInfoDF: DataFrame = sparkSession.sparkContext.textFile("src/main/resources/hive_test/user_info.txt")
    .filter(_.nonEmpty)
    // Skip the header row shown in the test data above, if present
    .filter(!_.startsWith("uuid,"))
    .map(line => {
      val arr: Array[String] = line.split(",")
      UserInfo(arr(0), arr(1), arr(2), arr(3), arr(4), arr(5))
    }).toDF()
  // Insert the rows into the Hive table created above
  userInfoDF.coalesce(1).createOrReplaceTempView("tmp")
  sparkSession.sql("insert into table hudi.user_info_demo select * from tmp")
}
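
A quick way to verify the load, reusing the same session (not shown in the original):

sparkSession.sql("select * from hudi.user_info_demo").show(false)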
Sync Copy-on-Write
Hudi insert with Hive sync
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions}
import org.apache.hudi.config.{HoodieIndexConfig, HoodieWriteConfig}
import org.apache.hudi.index.HoodieIndex
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.junit.Test
import org.apache.spark.sql.functions._
@Test
def sync2Hive(): Unit = {
  val sourceDF: DataFrame = sparkSession.sql("select * from hudi.user_info_demo")
  sourceDF
    .withColumn("bir_date", from_unixtime(unix_timestamp(col("bir_date"), "yyyyMMdd"), "yyyy/MM/dd"))
    .write
    .format("org.apache.hudi")
    // Record key column; multiple fields may be specified
    .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uuid")
    // Precombine field: when two records share a key, the one with the larger value wins
    .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "update_date")
    // Partition path field
    .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "bir_date")
    // Whether a record moves to the new partition directory when its partition value changes
    .option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true")
    // Index type: HBASE, INMEMORY, BLOOM and GLOBAL_BLOOM are currently available.
    // GLOBAL_BLOOM is required so records can still be located after a partition change.
    .option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name())
    // Hudi table name
    .option(HoodieWriteConfig.TABLE_NAME, "hudi_hive_sync")
    // Shuffle parallelism
    .option("hoodie.insert.shuffle.parallelism", "2")
    .option("hoodie.upsert.shuffle.parallelism", "2")
    // Hive sync options
    // Hive database
    .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "hudi")
    // Hive table
    .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "hudi_hive_sync")
    // Register the dataset and sync it to Hive
    .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
    // Hive table partition columns
    .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "bir_year,bir_month,bir_day")
    // HiveServer2 address
    .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://localhost:10000")
    // Extracts the Hive partition values from the partition path; MultiPartKeysValueExtractor splits on "/".
    // You can also supply your own by extending PartitionValueExtractor and overriding
    // extractPartitionValuesInPath(partitionPath: String); see MultiPartKeysValueExtractor for reference,
    // and the sketch after this method.
    .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, "org.apache.hudi.hive.MultiPartKeysValueExtractor")
    .mode(SaveMode.Append)
    .save("hdfs://localhost:8020/user/hive/warehouse/hudi.db/hudi_hive_sync")
}
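
As noted in the comments above, the partition value extractor is pluggable. Below is a minimal sketch of a custom extractor (the class name is hypothetical); it assumes a yyyy/MM/dd partition path like the one produced above and splits it into the three Hive partition values, much as MultiPartKeysValueExtractor does:

import java.util.{Arrays => JArrays, List => JList}
import org.apache.hudi.hive.PartitionValueExtractor

// Hypothetical example: split a "yyyy/MM/dd" partition path into year/month/day values
class DatePathPartitionValueExtractor extends PartitionValueExtractor {
  override def extractPartitionValuesInPath(partitionPath: String): JList[String] = {
    val parts: Array[String] = partitionPath.split("/")
    require(parts.length == 3, s"Expected a yyyy/MM/dd partition path, got: $partitionPath")
    JArrays.asList(parts: _*)
  }
}

Passing this class's fully qualified name via HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY would replace MultiPartKeysValueExtractor.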
Inspect the table schema created in Hive
CREATE EXTERNAL TABLE `hudi_hive_sync7`(
  `_hoodie_commit_time` string,
  `_hoodie_commit_seqno` string,
  `_hoodie_record_key` string,
  `_hoodie_partition_path` string,
  `_hoodie_file_name` string,
  `uuid` string,
  `name` string,
  `addr` string,
  `phone` string,
  `update_date` string,
  `bir_date` string)
PARTITIONED BY (
  `bir_year` string,
  `bir_month` string,
  `bir_day` string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hudi.hadoop.HoodieParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://localhost:8020/user/hive/warehouse/hudi.db/hudi_hive_sync7'
TBLPROPERTIES (
  'bucketing_version'='2',
  'last_commit_time_sync'='20200806161512',
  'transient_lastDdlTime'='1596700622')
Inspect the data written to Hive
Inspect the HDFS directory structure
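A minimal sketch (base path assumed from the save() call above) that prints the partition layout with the Hadoop FileSystem API:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

@Test
def listHdfsLayout(): Unit = {
  // Base path assumed from the write above
  val basePath = "hdfs://localhost:8020/user/hive/warehouse/hudi.db/hudi_hive_sync"
  val fs: FileSystem = FileSystem.get(URI.create(basePath), new Configuration())
  // Expect the .hoodie metadata directory plus one directory tree per partition, e.g. 1997/01/01
  fs.listStatus(new Path(basePath)).foreach(status => println(status.getPath))
}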
Hudi update with Hive sync
@Test
def update2Hive(): Unit = {
  val sourceDF: DataFrame = sparkSession.sql("select * from hudi.user_info_demo")
  // Modify some of the original rows
  val result = sourceDF
    .where(col("bir_date") === "19970101")
    .withColumn("bir_date", from_unixtime(unix_timestamp(col("bir_date"), "yyyyMMdd"), "yyyy/MM/dd"))
    .withColumn("update_date", lit("20200806"))
    .withColumn("name", lit("世界和平"))
  result
    .write
    .format("org.apache.hudi")
    .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "uuid")
    .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "update_date")
    .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "bir_date")
    .option(HoodieIndexConfig.BLOOM_INDEX_UPDATE_PARTITION_PATH, "true")
    .option(HoodieIndexConfig.INDEX_TYPE_PROP, HoodieIndex.IndexType.GLOBAL_BLOOM.name())
    .option(HoodieWriteConfig.TABLE_NAME, "hudi_hive_sync")
    .option("hoodie.insert.shuffle.parallelism", "2")
    .option("hoodie.upsert.shuffle.parallelism", "2")
    // Hive sync options, same as the insert above
    .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "hudi")
    .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "hudi_hive_sync")
    .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true")
    .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "bir_year,bir_month,bir_day")
    .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY, "org.apache.hudi.hive.MultiPartKeysValueExtractor")
    .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://localhost:10000")
    .mode(SaveMode.Append)
    .save("hdfs://localhost:8020/user/hive/warehouse/hudi.db/hudi_hive_sync")
}
Inspect the updated data
Read the Hudi incremental view
@Test
def readIncrementView(): Unit = {
  sparkSession.read
    .format("org.apache.hudi")
    // Incremental query: only rows committed after the begin instant, up to the end instant
    .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
    .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20200806155654")
    .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, "20200806161512")
    .load("hdfs://localhost:8020/user/hive/warehouse/hudi.db/hudi_hive_sync")
    .show(false)
}
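
The begin and end instant times above were copied from the commits produced by the two writes. They can also be discovered programmatically; a minimal sketch using HoodieDataSourceHelpers from hudi-spark, assuming the same base path:

import java.net.URI
import org.apache.hadoop.fs.FileSystem
import org.apache.hudi.HoodieDataSourceHelpers

@Test
def listCommits(): Unit = {
  val basePath = "hdfs://localhost:8020/user/hive/warehouse/hudi.db/hudi_hive_sync"
  val fs: FileSystem = FileSystem.get(URI.create(basePath), sparkSession.sparkContext.hadoopConfiguration)
  // Every commit instant since the beginning of time ("000")
  println(HoodieDataSourceHelpers.listCommitsSince(fs, basePath, "000"))
  // The most recent commit instant, usable as END_INSTANTTIME_OPT_KEY
  println(HoodieDataSourceHelpers.latestCommit(fs, basePath))
}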
Incremental view query results
+-------------------+--------------------+------------------+----------------------+---------------------------------------------------------------------+----+--------+------------+-----------+-----------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name |uuid|name |addr |phone |update_date|bir_date |
+-------------------+--------------------+------------------+----------------------+---------------------------------------------------------------------+----+--------+------------+-----------+-----------+----------+
|20200806161512 |20200806161512_0_1 |15 |1997/01/01 |34067f9f-3dc5-4716-838d-f8b4129095f9-0_0-22-32_20200806161512.parquet|15 |世界和平|西安市莲湖区|137****5555|20200806 |1997/01/01|
|20200806161512 |20200806161512_0_2 |14 |1997/01/01 |34067f9f-3dc5-4716-838d-f8b4129095f9-0_0-22-32_20200806161512.parquet|14 |世界和平|西安市莲湖区|137****4444|20200806 |1997/01/01|
|20200806161512 |20200806161512_0_3 |16 |1997/01/01 |34067f9f-3dc5-4716-838d-f8b4129095f9-0_0-22-32_20200806161512.parquet|16 |世界和平|西安市莲湖区|137****6666|20200806 |1997/01/01|
+-------------------+--------------------+------------------+----------------------+---------------------------------------------------------------------+----+--------+------------+-----------+-----------+----------+
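
For comparison, the same table can be read as a regular snapshot (the default query type). A minimal sketch following the Hudi 0.5.x quickstart convention of globbing through the partition levels; here three wildcards are assumed to cover the yyyy/MM/dd directories and the last to match the data files:

sparkSession.read
  .format("org.apache.hudi")
  // Snapshot view over all partitions of the table written above
  .load("hdfs://localhost:8020/user/hive/warehouse/hudi.db/hudi_hive_sync/*/*/*/*")
  .show(false)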