CREATE TABLE
CREATE TABLE命令通过指定带有表属性的字段列表来创建Hudi Table。
格式:
CREATE TABLE [ IF NOT EXISTS] [database_name.]table_name
[ (columnTypeList)]
USING hudi
[ COMMENT table_comment ]
[ LOCATION location_path ]
[ OPTIONS (options_list) ]
创建非分区表
创建一个cow内部表
-- Copy-on-write managed (internal) table keyed on `id`.
CREATE TABLE IF NOT EXISTS hudi_table0 (
    id    INT,
    name  STRING,
    price DOUBLE
) USING hudi
OPTIONS (
    primaryKey = 'id',
    type       = 'cow'
);
创建一个mor外部表
-- Merge-on-read external table; the explicit LOCATION is what makes it external.
CREATE TABLE IF NOT EXISTS hudi_table1 (
    id    INT,
    name  STRING,
    price DOUBLE,
    ts    BIGINT
) USING hudi
LOCATION '/tmp/hudi/hudi_table1'
OPTIONS (
    primaryKey      = 'id,name',  -- composite record key
    preCombineField = 'ts',       -- ordering field used to pick the latest record
    type            = 'mor'
);
创建分区表
-- Copy-on-write external table partitioned by date (dt) and hour (hh).
CREATE TABLE IF NOT EXISTS hudi_table_p0 (
    id   BIGINT,
    name STRING,
    ts   BIGINT,
    dt   STRING,
    hh   STRING
) USING hudi
LOCATION '/tmp/hudi/hudi_table_p0'
OPTIONS (
    primaryKey      = 'id',
    preCombineField = 'ts',
    type            = 'cow'
)
PARTITIONED BY (dt, hh);
以SQL方式创建一个Hudi外表，与通过spark-shell或DeltaStreamer方式创建的Hudi外表相同
-- Register an existing Hudi dataset (e.g. one written by spark-shell or
-- DeltaStreamer) as an external table; no column list is given, so the
-- schema is taken from the data at LOCATION.
CREATE TABLE h_p1
USING hudi
OPTIONS (
    primaryKey      = 'id',
    preCombineField = 'ts'
)
PARTITIONED BY (dt)
LOCATION '/path/to/hudi';
创建表指定表属性
-- Table with extra Hudi housekeeping properties pinned at creation time.
CREATE TABLE IF NOT EXISTS h3 (
    id    BIGINT,
    name  STRING,
    price DOUBLE
) USING hudi
OPTIONS (
    primaryKey = 'id',
    type       = 'mor',
    -- retention settings for the cleaner / archived timeline
    hoodie.cleaner.fileversions.retained = '20',
    hoodie.keep.max.commits              = '20'
);
备注:Hudi当前不支持使用char、varchar、tinyint、smallint类型,建议使用string或int类型。
CREATE TABLE AS SELECT
CREATE TABLE AS SELECT命令基于查询语句的结果创建Hudi Table，并将查询结果数据写入新表。
格式:
CREATE TABLE [ IF NOT EXISTS] [database_name.]table_name
USING hudi
[ COMMENT table_comment ]
[ LOCATION location_path ]
[ OPTIONS (options_list) ]
[ AS query_statement ]
创建分区表
-- CTAS into a partitioned copy-on-write table.
CREATE TABLE h2
USING hudi
OPTIONS (primaryKey = 'id', type = 'cow')
PARTITIONED BY (dt)
AS SELECT 1 AS id, 'a1' AS name, 10 AS price, 1000 AS dt;
创建非分区表
-- CTAS into a non-partitioned table.
CREATE TABLE h3
USING hudi
AS SELECT 1 AS id, 'a1' AS name, 10 AS price;
从parquet表加载数据到hudi表
创建parquet表
create table parquet_mngd using parquet options(path=’hdfs:///tmp/parquet_dataset/*.parquet’);
CTAS创建hudi表
-- CTAS: bulk-load the parquet data into a partitioned Hudi table.
CREATE TABLE hudi_tbl
USING hudi
LOCATION 'hdfs:///tmp/hudi/hudi_tbl/'
OPTIONS (
    primaryKey      = 'id',
    preCombineField = 'ts',
    type            = 'cow'
)
PARTITIONED BY (datestr)
AS SELECT * FROM parquet_mngd;
备注:为了获得更好的数据加载性能，CTAS使用bulk insert作为写入方式。
DROP TABLE
格式:
DROP TABLE [IF EXISTS] [db_name.]table_name;
删除表
DROP TABLE IF EXISTS hudidb.h1;
SHOW TABLE
展示当前库中所有表
SHOW TABLES IN hudidb;
ALTER RENAME TABLE
重命名表
ALTER TABLE oldTableName RENAME TO newTableName
alter table h0 rename to h0_1;
ALTER ADD COLUMNS
添加新列
ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)
alter table h0_1 add columns(ext0 string);
TRUNCATE TABLE
清空表数据
TRUNCATE TABLE tableIdentifier
truncate table h0_1;
INSERT INTO
Hudi对于设置了主键的表支持三种Insert模式,默认为upsert。
1、strict模式，Insert 语句将保留 COW 表的主键唯一性约束，不允许重复记录。如果在插入过程中已经存在记录，则会为 COW 表抛出异常，对于MOR表，该模式与upsert模式行为一致。
2、non-strict模式,对主键表采用insert处理。
3、upsert模式,对于主键表的重复值进行更新操作。
-- Plain insert into a non-partitioned table.
insert into h0 select 1, 'a1', 20;
-- insert static partition
insert into h_p0 partition(dt = '2021-01-02') select 1, 'a1';
-- insert dynamic partition
-- NOTE(review): `dt` below has no FROM clause to resolve against — verify this
-- example against the target engine; it likely needs a source table.
insert into h_p0 select 1, 'a1', dt;
-- insert dynamic partition
insert into h_p1 select 1 as id, 'a1', '2021-01-03' as dt, '19' as hh;
-- insert overwrite table
insert overwrite table h0 select 1, 'a1', 20;
-- insert overwrite table with static partition
insert overwrite h_p0 partition(dt = '2021-01-02') select 1, 'a1';
-- insert overwrite table with dynamic partition
insert overwrite table h_p1 select 2 as id, 'a2', '2021-01-03' as dt, '19' as hh;
MERGE INTO
通过MERGE INTO命令,根据一张表或子查询的连接条件对另外一张表进行查询,连接条件匹配上的进行UPDATE或DELETE,无法匹配的执行INSERT。这个语法仅需要一次全表扫描就完成了全部同步工作,执行效率要高于INSERT+UPDATE。
格式:
MERGE INTO tableIdentifier AS target_alias
USING (sub_query | tableIdentifier) AS source_alias
ON <merge_condition>
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN NOT MATCHED [ AND <condition> ] THEN <not_matched_action> ]
<merge_condition> = 等值布尔连接条件
<matched_action> =
DELETE |
UPDATE SET * |
UPDATE SET column1 = expression1 [, column2 = expression2 ...]
<not_matched_action> =
INSERT * |
INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])
1.merge-on condition当前只支持主键列。
2.当前仅支持对COW表进行部分字段的更新。
-- Partial-column update on matched rows (only id and price are set).
merge into h0 using s0
on h0.id = s0.id
when matched then update set h0.id = s0.id,price = s0.price * 2
3.当前仅支持对COW表进行更新时,目标表的字段出现在更新表达式的右值。
-- Update expressions whose right-hand side references target-table
-- columns (h0.name, h0.price).
merge into h0 using s0 on h0.id = s0.id
when matched then update set id = s0.id,
name = h0.name,
price = s0.price + h0.price
4.如果针对部分字段进行更新，更新值必须包含主键列：示例中Hudi表使用id和start_dt作为复合主键。
-- Partial-field update where the SET list includes every primary-key
-- column (composite key: id + start_dt), as the surrounding note requires.
merge into hudi_test3 D using (select id,start_dt from tmp_tb2 where col2 in ('D', 'U')) N
on (N.id = D.id and N.start_dt = D.start_dt)
when matched then update set
D.id = N.id,
D.start_dt = N.start_dt,
D.end_dt = '2022-04-28';
-- Full-row sync from a subquery: update all columns on key match,
-- insert the whole row otherwise.
MERGE INTO h0 AS target
USING (
    SELECT id, name, price, flag
    FROM s
) source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;
-- Conditional merge: matched rows flagged 'delete' are removed,
-- other matches are updated, and unmatched rows are inserted.
MERGE INTO h0
USING (
    SELECT id, name, price, flag
    FROM s
) source
ON h0.id = source.id
WHEN MATCHED AND flag != 'delete' THEN
    UPDATE SET id = source.id, name = source.name, price = source.price * 2
WHEN MATCHED AND flag = 'delete' THEN
    DELETE
WHEN NOT MATCHED THEN
    INSERT (id,name,price) VALUES (source.id, source.name, source.price);
-- Table-to-table full-row merge (no subquery on the source side).
MERGE INTO t0 AS target
USING s0 source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;
UPDATE
格式
-- Column updates filtered by primary key.
update h0 set price = price + 20 where id = 1;
-- Multiple columns may be set in one statement.
update h0 set price = price *2, name = 'a2' where id = 2;
DELETE
格式
DELETE from tableIdentifier [ WHERE boolExpression]
-- Delete by literal predicate.
delete from h0 where column1 = 'country';
-- Delete by value list.
delete from h0 where column1 IN ('country1', 'country2');
-- Delete driven by a subquery.
delete from h0 where column1 IN (select column11 from sourceTable2);
delete from h0 where column1 IN (select column11 from sourceTable2 where column1 = 'USA');
-- Intentionally unfiltered: removes every row in h0.
delete from h0;
COMPACTION
压缩（compaction）用于将MergeOnRead表中基于行的log日志文件转化为parquet列式数据文件，以加快记录的查找。
格式
-- Schedule a compaction plan for (MOR) table h1.
schedule compaction on h1;
-- List compactions for h1.
show compaction on h1;
-- Execute the compaction scheduled at the given instant time.
run compaction on h1 at 20210915170758;
-- The same operations can address a table by its storage path.
schedule compaction on '/tmp/hudi/h1';
run compaction on '/tmp/hudi/h1';
SAVE_POINT
管理Hudi表的savepoint。
格式
创建savepoint:
call create_savepoints('[table_name]', '[commit_Time]', '[user]', '[comments]');
查看所有存在的savepoint
call show_savepoints(table =>'[table_name]');
回滚savepoint:
call rollback_savepoints('[table_name]', '[commit_Time]');