文章目录
hudi 支持丰富的 dml 支持,不止包括 insert overwrite 和 insert into,还支持 row-level 的 delete、update、merge into,更多详情见链接: https://hudi.apache.org/docs/quick-start-guide
为方便下面的case讲解,先创建几个表:
-- create a cow table, with default primaryKey 'uuid' and without preCombineField provided
-- 因为该表有uuid字段,Hudi默认存在uuid字段时,即使不指定主键,uuid默认会作为主键;
-- COW table without preCombineField; Hudi picks up `uuid` as the default primary key
CREATE TABLE hudi_cow_nonpcf_tbl (
    uuid  int,
    name  string,
    price double
) USING hudi;
-- create a mor non-partitioned table with preCombineField provided
-- MOR non-partitioned table keyed on `id`; `ts` resolves duplicate-key writes
CREATE TABLE hudi_mor_tbl (
    id    int,
    name  string,
    price double,
    ts    bigint
) USING hudi
TBLPROPERTIES (
    type = 'mor',
    primaryKey = 'id',
    preCombineField = 'ts'
);
-- create a partitioned, preCombineField-provided cow table
-- COW table partitioned by (dt, hh), keyed on `id`, de-duplicated by `ts`
create table hudi_cow_pt_tbl (
  id bigint,
  name string,
  ts bigint,
  dt string,
  hh string
) using hudi
tblproperties (
  type = 'cow',
  primaryKey = 'id',
  preCombineField = 'ts'
)
partitioned by (dt, hh)
-- fixed: the closing quote was a typographic apostrophe (’), which breaks the statement
location '/tmp/hudi/hudi_cow_pt_tbl';
1.Insert data(插入数据)
-- insert into the non-partitioned tables
insert into hudi_cow_nonpcf_tbl select 1, 'a1', 20;
insert into hudi_mor_tbl select 1, 'a1', 20, 1000;
-- dynamic-partition insert: partition values (dt, hh) come from the selected columns
insert into hudi_cow_pt_tbl partition (dt, hh)
select 1 as id, 'a1' as name, 1000 as ts, '2021-12-09' as dt, '10' as hh;
-- static-partition insert: partition values are fixed in the partition() clause
insert into hudi_cow_pt_tbl partition(dt = '2021-12-09', hh='11') select 2, 'a2', 1000;
注意:
默认情况下,如果指定了preCombineKey,那么insert into采用的是upsert的写入模式,否则采用insert模式;说明:upsert 既可以更新数据,又可以插入数据;insert 只能插入数据;对于指定preCombineKey的表,如果插入的数据的主键在表里已经存在,则只更新不新增;
示例:
--upsert mode for preCombineField-provided table(upsert模式写入指定preCombineField的表)
--我们上面在hudi_mor_tbl表里已经插入了一条数据:
select id, name, price, ts from hudi_mor_tbl;
1 a1 20.0 1000
--现在再插入一条:
insert into hudi_mor_tbl select 1, 'a1_1', 20, 1001;
select id, name, price, ts from hudi_mor_tbl;
1 a1_1 20.0 1001
可以看到,对于指定了主键 primaryKey = 'id' 的 hudi_mor_tbl,当再次插入主键相同的一条数据的时候,insert into 采用的是 upsert 的更新模式,并不会新增一条数据
同时支持bulk_insert的写入模式,只需要设置两个参数:hoodie.sql.bulk.insert.enable 和 hoodie.sql.insert.mode.
示例:
-- bulk_insert mode for preCombineField-provided table
set hoodie.sql.bulk.insert.enable=true; -- enable bulk-insert write mode
set hoodie.sql.insert.mode=non-strict; -- disable insert-time de-duplication on the primary key
insert into hudi_mor_tbl select 1, 'a1_2', 20, 1002;
select id, name, price, ts from hudi_mor_tbl;
1 a1_1 20.0 1001
1 a1_2 20.0 1002
从上面查询结果可以看出,虽然hudi_mor_tbl设置了id主键,但是当设置set hoodie.sql.insert.mode=non-strict;时,插入主键相同的一条数据,并没有去重;另外这里的示例没有展示出批量插入的方式,只插入了一条数据;
2.Query data(查询数据)
select fare, begin_lon, begin_lat, ts from hudi_trips_snapshot where fare > 20.0
3.Time Travel Query
对于历史的提交版本,你可以通过指定其提交的时间戳进行访问,环境要求:Spark 3.2+
-- NOTE(review): hudi_cow_pt_tbl was already created earlier in this doc with the same
-- schema and location; drop it first if running this time-travel example standalone
create table hudi_cow_pt_tbl (
id bigint,
name string,
ts bigint,
dt string,
hh string
) using hudi
tblproperties (
type = 'cow',
primaryKey = 'id',
preCombineField = 'ts'
)
partitioned by (dt, hh)
location '/tmp/hudi/hudi_cow_pt_tbl';
-- seed one record (this is the first commit)
insert into hudi_cow_pt_tbl select 1, 'a0', 1000, '2021-12-09', '10';
select * from hudi_cow_pt_tbl;
-- record id=1 changes `name` (second commit)
insert into hudi_cow_pt_tbl select 1, 'a1', 1001, '2021-12-09', '10';
select * from hudi_cow_pt_tbl;
-- time travel based on first commit time, assume `20220307091628793`
select * from hudi_cow_pt_tbl timestamp as of '20220307091628793' where id = 1;
-- time travel based on different timestamp formats (requires Spark 3.2+)
select * from hudi_cow_pt_tbl timestamp as of '2022-03-07 09:16:28.100' where id = 1;
select * from hudi_cow_pt_tbl timestamp as of '2022-03-08' where id = 1;
时间戳的方式可通过下面的 spark sql 语句查询
注意:如果是自己的客户端,需要先 use db;
call show_commits(table => ‘test_hudi_table’, limit => 10);
4.Update(更新操作)
语法:
UPDATE tableIdentifier SET column = EXPRESSION(,column = EXPRESSION) [ WHERE boolExpression]
示例:
-- fixed: stray "sql " prefix removed from the first statement (it is not valid SQL)
update hudi_mor_tbl set price = price * 2, ts = 1111 where id = 1;
update hudi_cow_pt_tbl set name = 'a1_1', ts = 1001 where id = 1;
-- update using non-PK field
update hudi_cow_pt_tbl set ts = 1001 where name = 'a1';
注意:update操作需要声明preCombineField;
5.MergeInto(合并插入)
简单的说就是,判断表中有没有符合on()条件中的数据,有了就更新数据,没有就插入数据;
语法:
MERGE INTO tableIdentifier AS target_alias
USING (sub_query | tableIdentifier) AS source_alias
ON <merge_condition>
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN NOT MATCHED [ AND <condition> ] THEN <not_matched_action> ]
<merge_condition> = a boolean equality condition
<matched_action> =
DELETE |
UPDATE SET * |
UPDATE SET column1 = expression1 [, column2 = expression2 ...]
<not_matched_action> =
INSERT * |
INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])
示例:
-- source table using hudi for testing merging into non-partitioned table
-- Hudi source table for the non-partitioned MERGE INTO example
CREATE TABLE merge_source (
    id    int,
    name  string,
    price double,
    ts    bigint
) USING hudi
TBLPROPERTIES (
    primaryKey = 'id',
    preCombineField = 'ts'
);
insert into merge_source values (1, "old_a1", 22.22, 900), (2, "new_a2", 33.33, 2000), (3, "new_a3", 44.44, 2000);
-- upsert hudi_mor_tbl from merge_source: rows matched on id are updated, the rest inserted
merge into hudi_mor_tbl as target
using merge_source as source
on target.id = source.id
when matched then update set *
when not matched then insert *
;
-- source table using parquet for testing merging into partitioned table
-- plain parquet source table for the partitioned MERGE INTO example
CREATE TABLE merge_source2 (
    id   int,
    name string,
    flag string,
    dt   string,
    hh   string
) USING parquet;
insert into merge_source2 values (1, "new_a1", 'update', '2021-12-09', '10'), (2, "new_a2", 'delete', '2021-12-09', '11'), (3, "new_a3", 'insert', '2021-12-09', '12');
-- route each source row by its `flag` column: 'update' / 'delete' / anything else inserted
merge into hudi_cow_pt_tbl as target
using
(
-- NOTE(review): ts is selected as the string '1000'; presumably Spark implicitly casts it to the bigint ts column — confirm
select id, name, '1000' as ts, flag, dt, hh from merge_source2
) source
on target.id = source.id
-- `flag` is unqualified below; it can only resolve to source.flag since the target table has no such column
when matched and flag != 'delete' then
update set id = source.id, name = source.name, ts = source.ts, dt = source.dt, hh = source.hh
when matched and flag = 'delete' then delete
when not matched then
insert (id, name, ts, dt, hh) values(source.id, source.name, source.ts, source.dt, source.hh)
;
6.Hard Deletes(硬删除)
语法:
DELETE FROM tableIdentifier [ WHERE BOOL_EXPRESSION]
示例:
delete from hudi_cow_nonpcf_tbl where uuid = 1;
-- deletes every even-id row; the predicate intentionally matches multiple rows
delete from hudi_mor_tbl where id % 2 = 0;
-- delete using non-PK field
delete from hudi_cow_pt_tbl where name = 'a1';
7.Insert Overwrite(覆盖写入)
-- insert overwrite non-partitioned table (replaces the whole table content)
insert overwrite hudi_mor_tbl select 99, 'a99', 20.0, 900;
insert overwrite hudi_cow_nonpcf_tbl select 99, 'a99', 20.0;
-- insert overwrite partitioned table with dynamic partition: only the partition the row maps to is replaced
insert overwrite table hudi_cow_pt_tbl select 10, 'a10', 1100, '2021-12-09', '10';
-- insert overwrite partitioned table with static partition: only dt=2021-12-09/hh=12 is replaced
insert overwrite hudi_cow_pt_tbl partition(dt = '2021-12-09', hh='12') select 13, 'a13', 1100;
8.更多Spark SQL命令
8.1Alter table
语法:
-- Rename a table
ALTER TABLE oldTableName RENAME TO newTableName
-- Add columns to a table
ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)
-- Change a column's type (column name is repeated: old name, new name, new type)
ALTER TABLE tableIdentifier CHANGE COLUMN colName colName colType
-- Set table properties
ALTER TABLE tableIdentifier SET TBLPROPERTIES (key = 'value')
示例:
-- rename table:
ALTER TABLE hudi_cow_nonpcf_tbl RENAME TO hudi_cow_nonpcf_tbl2;
-- add column:
ALTER TABLE hudi_cow_nonpcf_tbl2 add columns(remark string);
-- change column type (uuid int -> bigint):
ALTER TABLE hudi_cow_nonpcf_tbl2 change column uuid uuid bigint;
-- set table properties:
alter table hudi_cow_nonpcf_tbl2 set tblproperties (hoodie.keep.max.commits = '10');
8.2Partition Sql Command
语法:
-- Drop Partition (one or more partition-column value pairs)
ALTER TABLE tableIdentifier DROP PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-- Show Partitions
SHOW PARTITIONS tableIdentifier
示例:
-- show partitions:
show partitions hudi_cow_pt_tbl;
-- drop partition (dt/hh values must name an existing partition):
alter table hudi_cow_pt_tbl drop partition (dt='2021-12-09', hh='10');