文章目录
hudi 支持丰富的 dml 支持,不止包括 insert overwrite 和 insert into,还支持 row-level 的 delete、update、merge into,更多详情见链接: https://hudi.apache.org/docs/quick-start-guide
为方便下面的case讲解,先创建几个表:
-- create a cow table, with default primaryKey 'uuid' and without preCombineField provided
-- 因为该表有uuid字段,Hudi默认存在uuid字段时,即使不指定主键,uuid默认会作为主键;
-- COW table without preCombineField; Hudi picks up `uuid` as the default primary key
CREATE TABLE hudi_cow_nonpcf_tbl (
    uuid  int,
    name  string,
    price double
) USING hudi;
-- create a mor non-partitioned table with preCombineField provided
-- MOR non-partitioned table keyed on `id`; `ts` resolves duplicate-key writes
CREATE TABLE hudi_mor_tbl (
    id    int,
    name  string,
    price double,
    ts    bigint
) USING hudi
TBLPROPERTIES (
    type = 'mor',
    primaryKey = 'id',
    preCombineField = 'ts'
);
-- create a partitioned, preCombineField-provided cow table
-- COW table partitioned by (dt, hh), keyed on `id`, de-duplicated by `ts`
create table hudi_cow_pt_tbl (
  id bigint,
  name string,
  ts bigint,
  dt string,
  hh string
) using hudi
tblproperties (
  type = 'cow',
  primaryKey = 'id',
  preCombineField = 'ts'
)
partitioned by (dt, hh)
-- fixed: the closing quote was a typographic apostrophe (’), which breaks the statement
location '/tmp/hudi/hudi_cow_pt_tbl';
1.Insert data(插入数据)
-- insert into the non-partitioned tables
insert into hudi_cow_nonpcf_tbl select 1, 'a1', 20;
insert into hudi_mor_tbl select 1, 'a1', 20, 1000;
-- dynamic-partition insert: partition values (dt, hh) come from the selected columns
insert into hudi_cow_pt_tbl partition (dt, hh)
select 1 as id, 'a1' as name, 1000 as ts, '2021-12-09' as dt, '10' as hh;
-- static-partition insert: partition values are fixed in the partition() clause
insert into hudi_cow_pt_tbl partition(dt = '2021-12-09', hh='11') select 2, 'a2', 1000;
注意:
默认情况下,如果指定了preCombineKey,那么insert into采用的是upsert的写入模式,否则采用insert模式;说明:upsert 既可以更新数据,又可以插入数据;insert 只能插入数据;对于指定preCombineKey的表,如果插入的数据的主键在表里已经存在,则只更新不新增;
示例:
--upsert mode for preCombineField-provided table(upsert模式写入指定preCombineField的表)
--我们上面在hudi_mor_tbl表里已经插入了一条数据:
select id, name, price, ts from hudi_mor_tbl;
1 a1 20.0 1000
--现在再插入一条:
insert into hudi_mor_tbl select 1, 'a1_1', 20, 1001;
select id, name, price, ts from hudi_mor_tbl;
1 a1_1 20.0 1001
可以看到,对于指定了主键 primaryKey = 'id' 的 hudi_mor_tbl,当再次插入主键相同的一条数据的时候,insert into 采用的是 upsert 的更新模式,并不会新增一条数据
同时支持bulk_insert的写入模式,只需要设置两个参数:hoodie.sql.bulk.insert.enable 和 hoodie.sql.insert.mode.
示例:
-- bulk_insert mode for preCombineField-provided table
set hoodie.sql.bulk.insert.enable=true; -- enable bulk-insert write mode
set hoodie.sql.insert.mode=non-strict; -- disable insert-time de-duplication on the primary key
insert into hudi_mor_tbl select 1, 'a1_2', 20, 1002;
select id, name, price, ts from hudi_mor_tbl;
1 a1_1 20.0 1001
1 a1_2 20.0 1002
从上面查询结果可以看出,虽然hudi_mor_tbl设置了id主键,但是当设置set hoodie.sql.insert.mode=non-strict;时,插入主键相同的一条数据,并没有去重;另外这里的示例没有展示出批量插入的方式,只插入了一条数据;
2.Query data(查询数据)
select fare, begin_lon, begin_lat, ts from hudi_trips_snapshot where fare > 20.0
3.Time Travel Query
对于历史的提交版本,你可以通过指定其提交的时间戳进行访问,环境要求:Spark 3.2+
-- NOTE(review): hudi_cow_pt_tbl was already created earlier in this doc with the same
-- schema and location; drop it first if running this time-travel example standalone
create table hudi_cow_pt_tbl (
id bigint,
name string,
ts bigint,
dt string,
hh string
) using hudi
tblproperties (
type = 'cow',
primaryKey = 'id',
preCombineField = 'ts'
)
partitioned by (dt, hh)
location '/tmp/hudi/hudi_cow_pt_tbl';
-- seed one record (this is the first commit)
insert into hudi_cow_pt_tbl select 1, 'a0', 1000, '2021-12-09', '10';
select * from hudi_cow_pt_tbl;
-- record id=1 changes `name` (second commit)
insert into hudi_cow_pt_tbl select 1, 'a1', 1001, '2021-12-09', '10';
select * from hudi_cow_pt_tbl;
-- time travel based on first commit time, assume `20220307091628793`
select * from hudi_cow_pt_tbl timestamp as of '20220307091628793' where id = 1;
-- time travel based on different timestamp formats (requires Spark 3.2+)
select * from hudi_cow_pt_tbl timestamp as of '2022-03-07 09:16:28.100' where id = 1;
select * from hudi_cow_pt_tbl timestamp as of '2022-03-08' where id = 1;
时间戳的方式可通过下面的 spark sql 语句查询
注意:如果是自己的客户端,需要先 use db;
call show_commits(table => ‘test_hudi_table’, limit => 10);
4.Update(更新操作)
语法:
UPDATE tableIdentifier SET column = EXPRESSION(,column = EXPRESSION) [ WHERE boolExpression]
示例:
-- fixed: stray "sql " prefix removed from the first statement (it is not valid SQL)
update hudi_mor_tbl set price = price * 2, ts = 1111 where id = 1;
update hudi_cow_pt_tbl set name = 'a1_1', ts = 1001 where id = 1;
-- update using non-PK field
update hudi_cow_pt_tbl set ts = 1001 where name = 'a1';
注意:update操作需要声明preCombineField;
5.MergeInto(合并插入)
简单的说就是,判断表中有没有符合on()条件中的数据,有了就更新数据,没有就插入数据;
语法:
MERGE INTO tableIdentifier AS target_alias
USING (sub_query | tableIdentifier) AS source_alias
ON <merge_condition>
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN NOT MATCHED [ AND <condition> ] THEN <not_matched_action> ]
<merge_condition> = a boolean equality condition
<matched_action> =
DELETE |
UPDATE SET * |
UPDATE SET column1 = expression1 [, column2 = expression2 ...]
<not_matched_action> =
INSERT * |
INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])
示例:
-- source table using hudi for testing merging into non-partitioned table
-- Hudi source table for the non-partitioned MERGE INTO example
CREATE TABLE merge_source (
    id    int,
    name  string,
    price double,
    ts    bigint
) USING hudi
TBLPROPERTIES (
    primaryKey = 'id',
    preCombineField = 'ts'
);
insert into merge_source values (1, "old_a1", 22.22, 900), (2, "new_a2", 33.33, 2000), (3, "new_a3", 44.44, 2000);
-- upsert hudi_mor_tbl from merge_source: rows matched on id are updated, the rest inserted
merge into hudi_mor_tbl as target
using merge_source as source
on target.id = source.id
when matched then update set *
when not matched then insert *
;
-- source table using parquet for testing merging into partitioned table
-- plain parquet source table for the partitioned MERGE INTO example
CREATE TABLE merge_source2 (
    id   int,
    name string,
    flag string,
    dt   string,
    hh   string
) USING parquet;
insert into merge_source2 values (1, "new_a1", 'update', '2021-12-09', '10'), (2, "new_a2", 'delete', '2021-12-09', '11'), (3, "new_a3", 'insert', '2021-12-09', '12');
-- route each source row by its `flag` column: 'update' / 'delete' / anything else inserted
merge into hudi_cow_pt_tbl as target
using
(
-- NOTE(review): ts is selected as the string '1000'; presumably Spark implicitly casts it to the bigint ts column — confirm
select id, name, '1000' as ts, flag, dt, hh from merge_source2
) source
on target.id = source.id
-- `flag` is unqualified below; it can only resolve to source.flag since the target table has no such column
when matched and flag != 'delete' then
update set id = source.id, name = source.name, ts = source.ts, dt = source.dt, hh = source.hh
when matched and flag = 'delete' then delete
when not matched then
insert (id, name, ts, dt, hh) values(source.id, source.name, source.ts, source.dt, source.hh)
;
6.Hard Deletes(硬删除)
语法:
DELETE FROM tableIdentifier [ WHERE BOOL_EXPRESSION]
示例:
delete from hudi_cow_nonpcf_tbl where uuid = 1;
-- deletes every even-id row; the predicate intentionally matches multiple rows
delete from hudi_mor_tbl where id % 2 = 0;
-- delete using non-PK field
delete from hudi_cow_pt_tbl where name = 'a1';
7.Insert Overwrite(覆盖写入)
-- insert overwrite non-partitioned table (replaces the whole table content)
insert overwrite hudi_mor_tbl select 99, 'a99', 20.0, 900;
insert overwrite hudi_cow_nonpcf_tbl select 99, 'a99', 20.0;
-- insert overwrite partitioned table with dynamic partition: only the partition the row maps to is replaced
insert overwrite table hudi_cow_pt_tbl select 10, 'a10', 1100, '2021-12-09', '10';
-- insert overwrite partitioned table with static partition: only dt=2021-12-09/hh=12 is replaced
insert overwrite hudi_cow_pt_tbl partition(dt = '2021-12-09', hh='12') select 13, 'a13', 1100;
8.更多Spark SQL命令
8.1Alter table
语法:
-- Rename a table
ALTER TABLE oldTableName RENAME TO newTableName
-- Add columns to a table
ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)
-- Change a column's type (column name is repeated: old name, new name, new type)
ALTER TABLE tableIdentifier CHANGE COLUMN colName colName colType
-- Set table properties
ALTER TABLE tableIdentifier SET TBLPROPERTIES (key = 'value')
示例:
-- rename table:
ALTER TABLE hudi_cow_nonpcf_tbl RENAME TO hudi_cow_nonpcf_tbl2;
-- add column:
ALTER TABLE hudi_cow_nonpcf_tbl2 add columns(remark string);
-- change column type (uuid int -> bigint):
ALTER TABLE hudi_cow_nonpcf_tbl2 change column uuid uuid bigint;
-- set table properties:
alter table hudi_cow_nonpcf_tbl2 set tblproperties (hoodie.keep.max.commits = '10');
8.2Partition Sql Command
语法:
-- Drop Partition (one or more partition-column value pairs)
ALTER TABLE tableIdentifier DROP PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-- Show Partitions
SHOW PARTITIONS tableIdentifier
示例:
-- show partitions:
show partitions hudi_cow_pt_tbl;
-- drop partition (dt/hh values must name an existing partition):
alter table hudi_cow_pt_tbl drop partition (dt='2021-12-09', hh='10');