Hudi Basics -- DML


Hudi provides rich DML support: not just insert overwrite and insert into, but also row-level delete, update, and merge into. See the docs for more details: https://hudi.apache.org/docs/quick-start-guide

To make the cases below easier to follow, first create a few tables:

-- create a cow table, with default primaryKey 'uuid' and without preCombineField provided
-- this table has a uuid column; when no primary key is specified, Hudi uses uuid as the primary key by default
create table hudi_cow_nonpcf_tbl (
  uuid int,
  name string,
  price double
) using hudi;


-- create a mor non-partitioned table with preCombineField provided
create table hudi_mor_tbl (
  id int,
  name string,
  price double,
  ts bigint
) using hudi
tblproperties (
  type = 'mor',
  primaryKey = 'id',
  preCombineField = 'ts'
);


-- create a partitioned, preCombineField-provided cow table
create table hudi_cow_pt_tbl (
  id bigint,
  name string,
  ts bigint,
  dt string,
  hh string
) using hudi
tblproperties (
  type = 'cow',
  primaryKey = 'id',
  preCombineField = 'ts'
 )
partitioned by (dt, hh)
location '/tmp/hudi/hudi_cow_pt_tbl';

1. Insert data

-- insert into non-partitioned table
insert into hudi_cow_nonpcf_tbl select 1, 'a1', 20;
insert into hudi_mor_tbl select 1, 'a1', 20, 1000;


-- insert into dynamic partition
insert into hudi_cow_pt_tbl partition (dt, hh)
select 1 as id, 'a1' as name, 1000 as ts, '2021-12-09' as dt, '10' as hh;


-- insert into static partition
insert into hudi_cow_pt_tbl partition(dt = '2021-12-09', hh='11') select 2, 'a2', 1000;

Note:
By default, if preCombineField is specified, insert into uses the upsert write mode; otherwise it uses the insert mode. In other words, upsert can both update and insert data, while insert can only insert. For a table with preCombineField specified, if the primary key of an incoming record already exists in the table, the existing record is updated and no new row is added.

Example:

-- upsert mode for a preCombineField-provided table
-- one record was already inserted into hudi_mor_tbl above:
select id, name, price, ts from hudi_mor_tbl;  
1   a1    20.0    1000

-- now insert another record with the same primary key:
insert into hudi_mor_tbl select 1, 'a1_1', 20, 1001;
select id, name, price, ts from hudi_mor_tbl;  
1   a1_1    20.0    1001

As you can see, for hudi_mor_tbl with primaryKey = 'id', inserting another record with the same primary key makes insert into follow the upsert path: the existing row is updated and no new row is added.
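
The note above also says that a table without preCombineField goes through the plain insert path instead of upsert. Below is a minimal sketch against hudi_cow_nonpcf_tbl (created above without a preCombineField); the comments about the outcome are an assumption based on that note, not verified output.

-- sketch: insert path for a table without preCombineField
-- hudi_cow_nonpcf_tbl already holds (1, 'a1', 20.0) from section 1
insert into hudi_cow_nonpcf_tbl select 1, 'a1_1', 21;
select uuid, name, price from hudi_cow_nonpcf_tbl;
-- because this table has no preCombineField, the statement is not an upsert;
-- whether the duplicate key is kept or rejected depends on hoodie.sql.insert.mode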

The bulk_insert write mode is also supported; it only requires two parameters: hoodie.sql.bulk.insert.enable and hoodie.sql.insert.mode.
Example:

-- bulk_insert mode for preCombineField-provided table
set hoodie.sql.bulk.insert.enable=true;  -- enable bulk insert mode
set hoodie.sql.insert.mode=non-strict;   -- non-strict mode, i.e. do not deduplicate on insert

insert into hudi_mor_tbl select 1, 'a1_2', 20, 1002;
select id, name, price, ts from hudi_mor_tbl;
1   a1_1    20.0    1001
1   a1_2    20.0    1002

As the query result shows, although hudi_mor_tbl declares id as its primary key, with set hoodie.sql.insert.mode=non-strict; a record with the same primary key is inserted without deduplication. Also note that this example only writes a single row, so it does not really demonstrate a bulk load.
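
Since only one row was written above, here is a minimal multi-row sketch under the same two settings (the rows with ids 4-6 are made up for illustration):

-- bulk-insert several rows at once while the two settings above are still active
insert into hudi_mor_tbl values
  (4, 'a4', 40.0, 1000),
  (5, 'a5', 50.0, 1000),
  (6, 'a6', 60.0, 1000);
-- reset afterwards so later statements go back to the default upsert behavior
set hoodie.sql.bulk.insert.enable=false;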

2. Query data

select fare, begin_lon, begin_lat, ts from hudi_trips_snapshot where fare > 20.0;
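
hudi_trips_snapshot comes from the Hudi quick-start guide and is not created in this article; equivalent snapshot queries against the tables created above would look like this (a sketch, results depend on the inserts you have run):

-- snapshot queries on the tables created in this article
select id, name, price, ts from hudi_mor_tbl where price > 10.0;
select id, name, ts, dt, hh from hudi_cow_pt_tbl where dt = '2021-12-09';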

3. Time Travel Query

Historical commits can be queried by specifying their commit timestamps. Requirement: Spark 3.2+.

-- reuse the partitioned cow table hudi_cow_pt_tbl created at the beginning of this article


insert into hudi_cow_pt_tbl select 1, 'a0', 1000, '2021-12-09', '10';
select * from hudi_cow_pt_tbl;


-- record id=1 changes `name`
insert into hudi_cow_pt_tbl select 1, 'a1', 1001, '2021-12-09', '10';
select * from hudi_cow_pt_tbl;


-- time travel based on first commit time, assume `20220307091628793`
select * from hudi_cow_pt_tbl timestamp as of '20220307091628793' where id = 1;
-- time travel based on different timestamp formats
select * from hudi_cow_pt_tbl timestamp as of '2022-03-07 09:16:28.100' where id = 1;
select * from hudi_cow_pt_tbl timestamp as of '2022-03-08' where id = 1;

The commit timestamps can be listed with the Spark SQL call below.
Note: if you are running this from your own client session, run use db; first.
call show_commits(table => 'test_hudi_table', limit => 10);

4. Update

Syntax:

UPDATE tableIdentifier SET column = EXPRESSION(,column = EXPRESSION) [ WHERE boolExpression]

Example:

update hudi_mor_tbl set price = price * 2, ts = 1111 where id = 1;


update hudi_cow_pt_tbl set name = 'a1_1', ts = 1001 where id = 1;


-- update using non-PK field
update hudi_cow_pt_tbl set ts = 1001 where name = 'a1';

Note: the update operation requires the table to declare preCombineField.
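
Following that note, hudi_cow_nonpcf_tbl (created above without a preCombineField) is not expected to accept UPDATE; a sketch of a statement that should therefore be rejected:

-- expected to fail: hudi_cow_nonpcf_tbl was created without preCombineField
update hudi_cow_nonpcf_tbl set price = price * 2 where uuid = 1;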

5. Merge Into

Simply put, merge into checks whether the target table contains rows that satisfy the on () condition: matching rows are updated (or deleted), and non-matching source rows are inserted.
Syntax:

MERGE INTO tableIdentifier AS target_alias
USING (sub_query | tableIdentifier) AS source_alias
ON <merge_condition>
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN NOT MATCHED [ AND <condition> ]  THEN <not_matched_action> ]


<merge_condition> = a boolean equality condition
<matched_action>  =
  DELETE  |
  UPDATE SET *  |
  UPDATE SET column1 = expression1 [, column2 = expression2 ...]
<not_matched_action>  =
  INSERT *  |
  INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])

Example:


-- source table using hudi for testing merging into non-partitioned table
create table merge_source 
(
  id int, 
  name string, 
  price double, 
  ts bigint
) using hudi
tblproperties 
(
  primaryKey = 'id', 
  preCombineField = 'ts'
);
insert into merge_source values (1, "old_a1", 22.22, 900), (2, "new_a2", 33.33, 2000), (3, "new_a3", 44.44, 2000);


merge into hudi_mor_tbl as target
using merge_source as source
on target.id = source.id
when matched then update set *
when not matched then insert *
;


-- source table using parquet for testing merging into partitioned table
create table merge_source2 
(
  id int, 
  name string, 
  flag string, 
  dt string, 
  hh string
) using parquet;
insert into merge_source2 values (1, "new_a1", 'update', '2021-12-09', '10'), (2, "new_a2", 'delete', '2021-12-09', '11'), (3, "new_a3", 'insert', '2021-12-09', '12');


merge into hudi_cow_pt_tbl as target
using 
(
  select id, name, '1000' as ts, flag, dt, hh from merge_source2
) source
on target.id = source.id
when matched and flag != 'delete' then
 update set id = source.id, name = source.name, ts = source.ts, dt = source.dt, hh = source.hh
when matched and flag = 'delete' then delete
when not matched then
 insert (id, name, ts, dt, hh) values(source.id, source.name, source.ts, source.dt, source.hh)
;

6. Hard Deletes

Syntax:

DELETE FROM tableIdentifier [ WHERE BOOL_EXPRESSION]

Example:

delete from hudi_cow_nonpcf_tbl where uuid = 1;

delete from hudi_mor_tbl where id % 2 = 0;

-- delete using non-PK field
delete from hudi_cow_pt_tbl where name = 'a1';

7. Insert Overwrite

-- insert overwrite non-partitioned table
insert overwrite hudi_mor_tbl select 99, 'a99', 20.0, 900;
insert overwrite hudi_cow_nonpcf_tbl select 99, 'a99', 20.0;


-- insert overwrite partitioned table with dynamic partition
insert overwrite table hudi_cow_pt_tbl select 10, 'a10', 1100, '2021-12-09', '10';

-- insert overwrite partitioned table with static partition
insert overwrite hudi_cow_pt_tbl partition(dt = '2021-12-09', hh='12') select 13, 'a13', 1100;

8. More Spark SQL Commands

8.1 Alter Table

Syntax:

-- Alter table name
ALTER TABLE oldTableName RENAME TO newTableName


-- Alter table add columns
ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)


-- Alter table column type
ALTER TABLE tableIdentifier CHANGE COLUMN colName colName colType

-- Alter table properties
ALTER TABLE tableIdentifier SET TBLPROPERTIES (key = 'value')

Example:

--rename to:
ALTER TABLE hudi_cow_nonpcf_tbl RENAME TO hudi_cow_nonpcf_tbl2;

--add column:
ALTER TABLE hudi_cow_nonpcf_tbl2 add columns(remark string);

--change column:
ALTER TABLE hudi_cow_nonpcf_tbl2 change column uuid uuid bigint;


--set properties:
alter table hudi_cow_nonpcf_tbl2 set tblproperties (hoodie.keep.max.commits = '10');

8.2 Partition SQL Commands

Syntax:

-- Drop Partition
ALTER TABLE tableIdentifier DROP PARTITION ( partition_col_name = partition_col_val [ , ... ] )


-- Show Partitions
SHOW PARTITIONS tableIdentifier

Example:

--show partition:
show partitions hudi_cow_pt_tbl;


--drop partition:
alter table hudi_cow_pt_tbl drop partition (dt='2021-12-09', hh='10');