参考链接:
代码部分:
CREATE TABLE source_table(
userid string,
loginname string,
regiondate string,
phonenum string,
birthday string,
status string,
lastlogindate string)
PARTITIONED BY (datatime string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
load data local INPATH '/tmp/xinniu/20210101' into table source_table partition (datatime='20210101');
load data local INPATH '/tmp/xinniu/20210102' into table source_table partition (datatime='20210102');
load data local INPATH '/tmp/xinniu/20210103' into table source_table partition (datatime='20210103');
CREATE TABLE zip_table(
userid string,
loginname string,
regiondate string,
phonenum string,
birthday string,
status string,
lastlogindate string,
datatime string,
etltimestamp string,
starttime string,
endtime string,
mark string)
-- 创建一张拉链表的备份表 备份拉链表历史开链数据
CREATE TABLE IF NOT EXISTS xinniu.zip_table_bk stored AS orc tblproperties ("orc.compress" = "SNAPPY") AS
SELECT
userID,
loginName,
regionDate,
phoneNum,
birthday,
status,
lastLoginDate,
dataTime,
etlTimestamp ,
startTime ,
endTime ,
mark
FROM
xinniu.zip_table
WHERE
1 = 0 ;
-- 将拉链表历史开链数据插入到bk备份表中
-- 卡拉链条件:startTime < to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd'))) AND endTime >= to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
-- 备份表非空判断:(select count(1) from xinniu.zip_table_bk limit 1) = 0 判断备份表非空才插入 此处必须判空 不能使用drop或者truncate的方式清空备份表 会导致失败重跑时丢数
INSERT
INTO
xinniu.zip_table_bk
SELECT
userID,
loginName,
regionDate,
phoneNum,
birthday,
status,
lastLoginDate,
dataTime,
etlTimestamp ,
startTime ,
endTime ,
mark
FROM
xinniu.zip_table
join (select count(1) cnt from xinniu.zip_table_bk limit 1) b
WHERE
startTime < to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
AND endTime >= to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
AND b.cnt = 0
;
-- 创建拉链表闭链数据备份表bf
CREATE TABLE IF NOT EXISTS xinniu.zip_table_bf stored AS orc tblproperties ("orc.compress" = "SNAPPY") AS
SELECT
userID,
loginName,
regionDate,
phoneNum,
birthday,
status,
lastLoginDate,
dataTime,
etlTimestamp ,
startTime ,
endTime ,
mark
FROM
xinniu.zip_table
WHERE
1 = 0 ;
-- 备份拉链表中历史闭链数据 卡拉链条件:endTime < to_date(to_timestamp('${hiveconf:batch_date}' , 'yyyyMMdd'))
-- 备份表非空判断:(select count(1) from xinniu.zip_table_bf limit 1) = 0 同上,不能使用drop或者truncate的方式清空备份表
INSERT
INTO
xinniu.zip_table_bf
SELECT
userID,
loginName,
regionDate,
phoneNum,
birthday,
status,
lastLoginDate,
dataTime,
etlTimestamp ,
startTime ,
endTime ,
mark
FROM
xinniu.zip_table
join
(
SELECT
count(1) cnt
FROM
xinniu.zip_table_bf
LIMIT 1) b
WHERE
endTime < to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
AND b.cnt = 0 ;
-- 中间加工表清空
DROP TABLE IF EXISTS xinniu.zip_table_nw;
DROP TABLE IF EXISTS xinniu.zip_table_od;
-- 创建中间表 新增变化修改中间表
CREATE TABLE IF NOT EXISTS xinniu.zip_table_nw stored AS orc tblproperties ("orc.compress" = "SNAPPY") AS
SELECT
userID,
loginName,
regionDate,
phoneNum,
birthday,
status,
lastLoginDate,
dataTime,
etlTimestamp ,
startTime ,
endTime ,
mark
FROM
xinniu.zip_table
WHERE
0 = 1;
-- 创建中间表 未变化中间表
CREATE TABLE IF NOT EXISTS xinniu.zip_table_od stored AS orc tblproperties ("orc.compress" = "SNAPPY") AS
SELECT
userID,
loginName,
regionDate,
phoneNum,
birthday,
status,
lastLoginDate,
dataTime,
etlTimestamp ,
startTime ,
endTime ,
mark
FROM
xinniu.zip_table
WHERE
0 = 1;
-- 新增、修改、删除变化数据插入变化中间表nw
-- 原始数据表与拉链表进行full join关联,通过主键is_pk关联 根据不同情况生成对应的startTime与endTime及mark三个技术字段
-- 字段值选择原始表与目标表的非空字段值 nvl(n.@{source_column_names}, o.@{xinniu.zip_table_column_names})
-- 本逻辑中mark分为I、D两种,新增与修改为I,删除为D
INSERT
INTO
TABLE xinniu.zip_table_nw
SELECT
nvl(n.userID,o.userID) ,
nvl(n.loginName,o.loginName) ,
nvl(n.regionDate,o.regionDate) ,
nvl(n.phoneNum,o.phoneNum) ,
nvl(n.birthday,o.birthday) ,
nvl(n.status,o.status) ,
nvl(n.lastLoginDate,o.lastLoginDate) ,
nvl(n.dataTime,o.dataTime) ,
current_date AS etlTimestamp ,
CASE
WHEN n.dataTime IS NULL THEN o.startTime
ELSE to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
END AS startTime ,
CASE
WHEN n.dataTime IS NULL THEN to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
ELSE to_date(from_unixtime(unix_timestamp('29991231' ,'yyyyMMdd')))
END AS endTime ,
CASE
WHEN ( n.userID is null ) THEN 'D'
ELSE 'I'
END AS mark
FROM
(
SELECT
userID,
loginName,
regionDate,
phoneNum,
birthday,
status,
lastLoginDate,
dataTime
FROM
xinniu.source_table
WHERE
dataTime = '${hiveconf:batch_date}' ) n
FULL JOIN xinniu.zip_table_bk o ON
o.userID = n.userID
WHERE
(
o.userID IS NULL )
OR (
n.userID IS NULL )
OR (
nvl( CAST(o.phoneNum AS string) , '' ) <> nvl( CAST(n.phoneNum AS string) , '' )
OR nvl( CAST(o.status AS string) , '' ) <> nvl( CAST(n.status AS string) , '' )
)
;
-- 闭链发生变化的数据 endTime改为hiveconf:batch_date
-- 未变化数据保持原来状态 新增与修改状态统一"I"
-- 发生变化的endTime逻辑:when n.startTime is not null then to_date(to_timestamp('${hiveconf:batch_date}' , 'yyyyMMdd'))
-- 没发生变化的endTime逻辑:when o.endTime >= to_date(to_timestamp('${hiveconf:batch_date}' , 'yyyyMMdd')) then to_date(to_timestamp('29991231','yyyyMMdd'))
INSERT
INTO
TABLE xinniu.zip_table_od
SELECT
o.userID,
o.loginName,
o.regionDate,
o.phoneNum,
o.birthday,
o.status,
o.lastLoginDate,
o.dataTime,
o.etlTimestamp ,
o.startTime ,
CASE
WHEN n.startTime IS NOT NULL THEN to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
WHEN o.endTime >= to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
THEN to_date(from_unixtime(unix_timestamp('29991231' ,'yyyyMMdd')))
ELSE o.endTime
END AS endTime ,
'I' AS mark
FROM
xinniu.zip_table_bk o
LEFT JOIN xinniu.zip_table_nw n ON
o.userID = n.userID
WHERE
nvl(n.endTime,to_date(from_unixtime(unix_timestamp('29991231' ,'yyyyMMdd')))) <> to_date(from_unixtime(unix_timestamp('${hiveconf:batch_date}' ,'yyyyMMdd')))
;
-- 清空拉链表
TRUNCATE TABLE xinniu.zip_table;
-- 插入数据到拉链表
INSERT
INTO
TABLE xinniu.zip_table
SELECT
*
FROM
xinniu.zip_table_nw
UNION ALL
SELECT
*
FROM
xinniu.zip_table_od
UNION ALL
SELECT
*
FROM
xinniu.zip_table_bf ;
-- 清空临时表
DROP TABLE xinniu.zip_table_bk;
DROP TABLE xinniu.zip_table_bf;
DROP TABLE xinniu.zip_table_nw;
DROP TABLE xinniu.zip_table_od;
hive -hiveconf batch_date=20210101 -f /tmp/xinniu/sqlfile && hive -hiveconf batch_date=20210102 -f /tmp/xinniu/sqlfile && hive -hiveconf batch_date=20210103 -f /tmp/xinniu/sqlfile