hvie merge into 批量更新
1、使用条件
hive2.2.0及之后的版本支持使用merge into 语法,使用源表数据批量目标表的数据。使用该功能还需做如下配置
1、参数配置
set hive.support.concurrency = true;
set hive.enforce.bucketing = true;
set hive.exec.dynamic.partition.mode = nonstrict;
set hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on = true;
set hive.compactor.worker.threads = 1;
set hive.auto.convert.join=false;
set hive.merge.cardinality.check=false; -- 目标表中出现重复匹配时要设置该参数才行
2、建表要求
创建两个表,一个作为merge的目标,一个作为merge的源。 请注意设置为启用事务并以orc格式存储。
(官网和其他博客都写:目标表必须加bucket,在本地测试发现目标表不用加 bucket , 本地hive版本是3.1.0 )
2.1 建表不加 bucket
2.1.1 建表
create table merge_data;
drop table merge_data.transactions;
CREATE TABLE merge_data.transactions(
ID int,
TranValue string,
last_update_user string,
tran_date string)
STORED AS ORC
TBLPROPERTIES ('transactional'='true');
drop table merge_data.merge_source;
CREATE TABLE merge_data.merge_source(
ID int,
TranValue string,
tran_date string)
STORED AS ORC;
2.1.2 将用一些数据填充目标和源表
INSERT INTO merge_data.transactions VALUES
(1, 'value_01', 'creation', '20170410'),
(2, 'value_02', 'creation', '20170410'),
(3, 'value_03', 'creation', '20170410'),
(4, 'value_04', 'creation', '20170410'),
(5, 'value_05', 'creation', '20170413'),
(6, 'value_06', 'creation', '20170413'),
(7, 'value_07', 'creation', '20170413'),
(8, 'value_08', 'creation', '20170413'),
(9, 'value_09', 'creation', '20170413'),
(10, 'value_10','creation', '20170413');
INSERT INTO merge_data.merge_source VALUES
(1, 'value_01', '20170410'),
(4, NULL, '20170410'),
(7, 'value_77777', '20170413'),
(8, NULL, '20170413'),
(8, 'value_08', '20170415'),
(11, 'value_11', '20170415');
2.1.3、批量更新语法
udate 语句后要跟新的字段,左边字段(被更新字段)不能加表 "别名."
MERGE INTO merge_data.transactions AS T
USING merge_data.merge_source AS S
ON T.ID = S.ID and T.tran_date = S.tran_date
WHEN MATCHED AND (T.TranValue != S.TranValue AND S.TranValue IS NOT NULL) THEN UPDATE SET TranValue = S.TranValue, last_update_user = 'merge_update'
WHEN MATCHED AND S.TranValue IS NULL THEN DELETE
WHEN NOT MATCHED THEN INSERT VALUES (S.ID, S.TranValue, 'merge_insert', S.tran_date);
2.2 建表不加 bucket
2.2.1 建表
create table merge_data;
drop table merge_data.transactions;
CREATE TABLE merge_data.transactions(
ID int,
TranValue string,
last_update_user string)
PARTITIONED BY (tran_date string)
CLUSTERED BY (ID) into 5 buckets
STORED AS ORC TBLPROPERTIES ('transactional'='true');
drop table merge_data.merge_source;
CREATE TABLE merge_data.merge_source(
ID int,
TranValue string,
tran_date string)
STORED AS ORC;
2.2.1 将用一些数据填充目标和源表
INSERT INTO merge_data.transactions PARTITION (tran_date) VALUES
(1, 'value_01', 'creation', '20170410'),
(2, 'value_02', 'creation', '20170410'),
(3, 'value_03', 'creation', '20170410'),
(4, 'value_04', 'creation', '20170410'),
(5, 'value_05', 'creation', '20170413'),
(6, 'value_06', 'creation', '20170413'),
(7, 'value_07', 'creation', '20170413'),
(8, 'value_08', 'creation', '20170413'),
(9, 'value_09', 'creation', '20170413'),
(10, 'value_10','creation', '20170413');
INSERT INTO merge_data.merge_source VALUES
(1, 'value_01', '20170410'),
(4, NULL, '20170410'),
(7, 'value_77777', '20170413'),
(8, NULL, '20170413'),
(8, 'value_08', '20170415'),
(11, 'value_11', '20170415');
2.1.3、批量更新语法
udate 语句后要跟新的字段,左边字段(被更新字段)不能加表 "别名."
MERGE INTO merge_data.transactions AS T
USING merge_data.merge_source AS S
ON T.ID = S.ID and T.tran_date = S.tran_date
WHEN MATCHED AND (T.TranValue != S.TranValue AND S.TranValue IS NOT NULL) THEN UPDATE SET TranValue = S.TranValue, last_update_user = 'merge_update'
WHEN MATCHED AND S.TranValue IS NULL THEN DELETE
WHEN NOT MATCHED THEN INSERT VALUES (S.ID, S.TranValue, 'merge_insert', S.tran_date);
转载出处:
https://www.bbsmax.com/A/kPzO38v8zx/
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-Examples.1 (官网wiki)
https://community.cloudera.com/t5/Community-Articles/Hive-ACID-Merge-by-Example/ta-p/245402