CREATE TABLE
CREATE TABLE命令通过指定带有表属性的字段列表来创建Hudi Table。
格式:
CREATE TABLE [ IF NOT EXISTS] [database_name.]table_name
[ (columnTypeList)]
USING hudi
[ COMMENT table_comment ]
[ LOCATION location_path ]
[ OPTIONS (options_list) ]
创建非分区表
创建一个cow内部表
-- Copy-on-write managed (internal) table keyed on `id`.
CREATE TABLE IF NOT EXISTS hudi_table0 (
    id    INT,
    name  STRING,
    price DOUBLE
) USING hudi
OPTIONS (
    primaryKey = 'id',
    type       = 'cow'
);
创建一个mor外部表
-- Merge-on-read external table; the explicit LOCATION is what makes it external.
CREATE TABLE IF NOT EXISTS hudi_table1 (
    id    INT,
    name  STRING,
    price DOUBLE,
    ts    BIGINT
) USING hudi
LOCATION '/tmp/hudi/hudi_table1'
OPTIONS (
    primaryKey      = 'id,name',  -- composite record key
    preCombineField = 'ts',       -- ordering field used to pick the latest record
    type            = 'mor'
);
创建分区表
-- Copy-on-write external table partitioned by date (dt) and hour (hh).
CREATE TABLE IF NOT EXISTS hudi_table_p0 (
    id   BIGINT,
    name STRING,
    ts   BIGINT,
    dt   STRING,
    hh   STRING
) USING hudi
LOCATION '/tmp/hudi/hudi_table_p0'
OPTIONS (
    primaryKey      = 'id',
    preCombineField = 'ts',
    type            = 'cow'
)
PARTITIONED BY (dt, hh);
以SQL方式创建一个Hudi外表，与通过spark-shell或DeltaStreamer方式创建的Hudi外表相同
-- Register an existing Hudi dataset (e.g. one written by spark-shell or
-- DeltaStreamer) as an external table; no column list is given, so the
-- schema is taken from the data at LOCATION.
CREATE TABLE h_p1
USING hudi
OPTIONS (
    primaryKey      = 'id',
    preCombineField = 'ts'
)
PARTITIONED BY (dt)
LOCATION '/path/to/hudi';
创建表指定表属性
-- Table with extra Hudi housekeeping properties pinned at creation time.
CREATE TABLE IF NOT EXISTS h3 (
    id    BIGINT,
    name  STRING,
    price DOUBLE
) USING hudi
OPTIONS (
    primaryKey = 'id',
    type       = 'mor',
    -- retention settings for the cleaner / archived timeline
    hoodie.cleaner.fileversions.retained = '20',
    hoodie.keep.max.commits              = '20'
);
备注:Hudi当前不支持使用char、varchar、tinyint、smallint类型,建议使用string或int类型。
CREATE TABLE AS SELECT
CREATE TABLE AS SELECT命令基于查询语句的结果创建Hudi Table，并将查询结果数据写入新表。
格式:
CREATE TABLE [ IF NOT EXISTS] [database_name.]table_name
USING hudi
[ COMMENT table_comment ]
[ LOCATION location_path ]
[ OPTIONS (options_list) ]
[ AS query_statement ]
创建分区表
-- CTAS into a partitioned copy-on-write table.
CREATE TABLE h2
USING hudi
OPTIONS (primaryKey = 'id', type = 'cow')
PARTITIONED BY (dt)
AS SELECT 1 AS id, 'a1' AS name, 10 AS price, 1000 AS dt;
创建非分区表
-- CTAS into a non-partitioned table.
CREATE TABLE h3
USING hudi
AS SELECT 1 AS id, 'a1' AS name, 10 AS price;
从parquet表加载数据到hudi表
创建parquet表
create table parquet_mngd using parquet options(path=’hdfs:///tmp/parquet_dataset/*.parquet’);
CTAS创建hudi表
-- CTAS: bulk-load the parquet data into a partitioned Hudi table.
CREATE TABLE hudi_tbl
USING hudi
LOCATION 'hdfs:///tmp/hudi/hudi_tbl/'
OPTIONS (
    primaryKey      = 'id',
    preCombineField = 'ts',
    type            = 'cow'
)
PARTITIONED BY (datestr)
AS SELECT * FROM parquet_mngd;
备注:为了获得更好的数据加载性能，CTAS使用bulk insert作为写入方式。
DROP TABLE
格式:
DROP TABLE [IF EXISTS] [db_name.]table_name;
删除表
DROP TABLE IF EXISTS hudidb.h1;
SHOW TABLE
展示当前库中所有表
SHOW TABLES IN hudidb;
ALTER RENAME TABLE
重命名表
ALTER TABLE oldTableName RENAME TO newTableName
alter table h0 rename to h0_1;
ALTER ADD COLUMNS
添加新列
ALTER TABLE tableIdentifier ADD COLUMNS(colAndType (,colAndType)*)
alter table h0_1 add columns(ext0 string);
TRUNCATE TABLE
清空表数据
TRUNCATE TABLE tableIdentifier
truncate table h0_1;
INSERT INTO
Hudi对于设置了主键的表支持三种Insert模式,默认为upsert。
1、strict模式，Insert 语句将保留 COW 表的主键唯一性约束，不允许重复记录。如果在插入过程中已经存在记录，则会为 COW 表抛出异常，对于MOR表，该模式与upsert模式行为一致。
2、non-strict模式,对主键表采用insert处理。
3、upsert模式,对于主键表的重复值进行更新操作。
-- Plain insert into a non-partitioned table.
insert into h0 select 1, 'a1', 20;
-- insert static partition
insert into h_p0 partition(dt = '2021-01-02') select 1, 'a1';
-- insert dynamic partition
-- NOTE(review): `dt` below has no FROM clause to resolve against — verify this
-- example against the target engine; it likely needs a source table.
insert into h_p0 select 1, 'a1', dt;
-- insert dynamic partition
insert into h_p1 select 1 as id, 'a1', '2021-01-03' as dt, '19' as hh;
-- insert overwrite table
insert overwrite table h0 select 1, 'a1', 20;
-- insert overwrite table with static partition
insert overwrite h_p0 partition(dt = '2021-01-02') select 1, 'a1';
-- insert overwrite table with dynamic partition
insert overwrite table h_p1 select 2 as id, 'a2', '2021-01-03' as dt, '19' as hh;
MERGE INTO
通过MERGE INTO命令,根据一张表或子查询的连接条件对另外一张表进行查询,连接条件匹配上的进行UPDATE或DELETE,无法匹配的执行INSERT。这个语法仅需要一次全表扫描就完成了全部同步工作,执行效率要高于INSERT+UPDATE。
格式:
MERGE INTO tableIdentifier AS target_alias
USING (sub_query | tableIdentifier) AS source_alias
ON <merge_condition>
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN NOT MATCHED [ AND <condition> ] THEN <not_matched_action> ]
<merge_condition> = 等值布尔连接条件
<matched_action> =
DELETE |
UPDATE SET * |
UPDATE SET column1 = expression1 [, column2 = expression2 ...]
<not_matched_action> =
INSERT * |
INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])
1.merge-on condition当前只支持主键列。
2.当前仅支持对COW表进行部分字段的更新。
-- Partial-column update on matched rows (only id and price are set).
merge into h0 using s0
on h0.id = s0.id
when matched then update set h0.id = s0.id,price = s0.price * 2
3.当前仅支持对COW表进行更新时,目标表的字段出现在更新表达式的右值。
-- Update expressions whose right-hand side references target-table
-- columns (h0.name, h0.price).
merge into h0 using s0 on h0.id = s0.id
when matched then update set id = s0.id,
name = h0.name,
price = s0.price + h0.price
4.如果针对部分字段进行更新，更新值必须包含主键列：示例中Hudi表使用id和start_dt作为复合主键。
-- Partial-field update where the SET list includes every primary-key
-- column (composite key: id + start_dt), as the surrounding note requires.
merge into hudi_test3 D using (select id,start_dt from tmp_tb2 where col2 in ('D', 'U')) N
on (N.id = D.id and N.start_dt = D.start_dt)
when matched then update set
D.id = N.id,
D.start_dt = N.start_dt,
D.end_dt = '2022-04-28';
-- Full-row sync from a subquery: update all columns on key match,
-- insert the whole row otherwise.
MERGE INTO h0 AS target
USING (
    SELECT id, name, price, flag
    FROM s
) source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;
-- Conditional merge: matched rows flagged 'delete' are removed,
-- other matches are updated, and unmatched rows are inserted.
MERGE INTO h0
USING (
    SELECT id, name, price, flag
    FROM s
) source
ON h0.id = source.id
WHEN MATCHED AND flag != 'delete' THEN
    UPDATE SET id = source.id, name = source.name, price = source.price * 2
WHEN MATCHED AND flag = 'delete' THEN
    DELETE
WHEN NOT MATCHED THEN
    INSERT (id,name,price) VALUES (source.id, source.name, source.price);
-- Table-to-table full-row merge (no subquery on the source side).
MERGE INTO t0 AS target
USING s0 source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;
UPDATE
格式
-- Column updates filtered by primary key.
update h0 set price = price + 20 where id = 1;
-- Multiple columns may be set in one statement.
update h0 set price = price *2, name = 'a2' where id = 2;
DELETE
格式
DELETE from tableIdentifier [ WHERE boolExpression]
-- Delete by literal predicate.
delete from h0 where column1 = 'country';
-- Delete by value list.
delete from h0 where column1 IN ('country1', 'country2');
-- Delete driven by a subquery.
delete from h0 where column1 IN (select column11 from sourceTable2);
delete from h0 where column1 IN (select column11 from sourceTable2 where column1 = 'USA');
-- Intentionally unfiltered: removes every row in h0.
delete from h0;
COMPACTION
压缩（compaction）用于将MergeOnRead表中基于行的log日志文件转化为parquet列式数据文件，以加快记录的查找。
格式
-- Schedule a compaction plan for (MOR) table h1.
schedule compaction on h1;
-- List compactions for h1.
show compaction on h1;
-- Execute the compaction scheduled at the given instant time.
run compaction on h1 at 20210915170758;
-- The same operations can address a table by its storage path.
schedule compaction on '/tmp/hudi/h1';
run compaction on '/tmp/hudi/h1';
SAVE_POINT
管理Hudi表的savepoint。
格式
创建savepoint:
call create_savepoints('[table_name]', '[commit_Time]', '[user]', '[comments]');
查看所有存在的savepoint
call show_savepoints(table =>'[table_name]');
回滚savepoint:
call rollback_savepoints('[table_name]', '[commit_Time]');