问题复现
spark-sql --queue xxx --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
构造数据
-- Reproduction DDL: table deliberately created WITHOUT preCombineField
-- (delete will succeed, update will throw HoodieException).
-- NOTE: the original snippet created the table as `h2` while every later
-- statement targets `hudi_test`; the name is corrected so the repro runs.
drop table if exists hudi_test;
create table hudi_test (
id int,
name string,
price double,
ts long,
dt string
) using hudi
partitioned by (dt)
options (
primaryKey = 'id',
type = 'cow'
);
-- Seed two rows into the 2021-05-05 partition of hudi_test.
insert into hudi_test values (1, 'hudi1', 97, 99, '2021-05-05');
insert into hudi_test values (2, 'hudi2', 97, 99, '2021-05-05');
删除正常
spark-sql> delete from hudi_test where id = 1;
22/09/03 22:32:07 WARN SparkSession$Builder: Using an existing SparkSession; some spark core configurations may not take effect.
14:43 WARN: Timeline-server-based markers are not supported for HDFS: base path hdfs://R2/projects/xxx/hive/dev_db/hudi_test. Falling back to direct markers.
14:49 WARN: Timeline-server-based markers are not supported for HDFS: base path hdfs://R2/projects/xxx/hive/dev_db/hudi_test. Falling back to direct markers.
14:49 WARN: Timeline-server-based markers are not supported for HDFS: base path hdfs://R2/projects/xxx/hive/dev_db/hudi_test. Falling back to direct markers.
Time taken: 7.776 seconds
更新异常
update hudi_test set name = 'leon' where id = 2;
异常日志
Caused by: org.apache.hudi.exception.HoodieException: (Part -) field not found in record. Acceptable fields were :[id, name, price, ts, dt]
at org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldVal(HoodieAvroUtils.java:485)
原因分析
建表时没有指定 preCombineField:删除操作不需要该字段,可以正常执行;但更新(update)走的是 upsert 流程,Hudi 会尝试从记录中读取 preCombine 字段做去重/合并,字段不存在时即抛出上面的 HoodieException。
个人感觉hudi还有比较多细节问题没解决,中英文档还不够丰富。
解决方案
建表时指定preCombineField,如下
-- Fix: declare preCombineField at table-creation time so Hudi's upsert
-- path can resolve the precombine column when deduplicating/merging records.
drop table if exists hudi_test;
create table hudi_test (
id int,
name string,
price double,
ts long,
dt string
) using hudi
partitioned by (dt)
options (
primaryKey = 'id',
preCombineField = 'ts', -- required for update/upsert to work on this table
type = 'cow'
);