1.需求简略描述
需要将一个日切片表数据转换为拉链数据,其中remark字段数值变化无规律且可能与历史值相同,相邻日分区相同的remark值整合到同一拉链区间,拉链数据end_dt='29991231'为开链区间
id | remark | stat_dt | id | remark | start_dt | end_dt | |
1 | a | 20230701 | 1 | a | 20230701 | 20230703 | |
1 | a | 20230702 | 1 | b | 20230703 | 20230706 | |
1 | b | 20230703 | 1 | a | 20230706 | 20230707 | |
1 | b | 20230704 | 1 | b | 20230707 | 20230708 | |
1 | b | 20230705 | 1 | a | 20230708 | 29991231 | |
1 | a | 20230706 | 2 | a | 20230701 | 20230706 | |
1 | b | 20230707 | 2 | b | 20230706 | 20230707 | |
1 | a | 20230708 | 2 | a | 20230707 | 29991231 | |
1 | a | 20230709 | |||||
2 | a | 20230701 | |||||
2 | a | 20230702 | |||||
2 | a | 20230703 | |||||
2 | a | 20230704 | |||||
2 | a | 20230705 | |||||
2 | b | 20230706 | |||||
2 | a | 20230707 | |||||
2 | a | 20230708 | |||||
2 | a | 20230709 |
2.问题
若直接按照id,remark进行group by 取stat_dt最大最小值,后续处理略(参考4.原需求完整代码),产生的结果会出现区间重叠
id | remark | start_dt | end_dt |
1 | a | 20230701 | 20230709 |
1 | b | 20230703 | 29991231 |
2 | a | 20230701 | 20230709 |
2 | b | 20230706 | 29991231 |
3.处理方案
分别以id和id+remark分组排序获取rn_id和rn_col,再通过rn_id和rn_col差值获取分组并列排序值rn,可得相邻日分区remark值相同时rn相等,相邻日分区remark值不同时rn跳跃递增,在按照id,remark,rn进行group by 处理,后续处理略(参考4.原需求完整代码)
id | remark | stat_dt | rn_id | rn_rmk | rn |
1 | a | 20230701 | 1 | 1 | 0 |
1 | a | 20230702 | 2 | 2 | 0 |
1 | b | 20230703 | 3 | 1 | 2 |
1 | b | 20230704 | 4 | 2 | 2 |
1 | b | 20230705 | 5 | 3 | 2 |
1 | a | 20230706 | 6 | 1 | 5 |
1 | b | 20230707 | 7 | 1 | 6 |
1 | a | 20230708 | 8 | 1 | 7 |
1 | a | 20230709 | 9 | 2 | 7 |
2 | a | 20230701 | 1 | 1 | 0 |
2 | a | 20230702 | 2 | 2 | 0 |
2 | a | 20230703 | 3 | 3 | 0 |
2 | a | 20230704 | 4 | 4 | 0 |
2 | a | 20230705 | 5 | 5 | 0 |
2 | b | 20230706 | 6 | 1 | 5 |
2 | a | 20230707 | 7 | 1 | 6 |
2 | a | 20230708 | 8 | 2 | 6 |
2 | a | 20230709 | 9 | 3 | 6 |
4.原需求完整代码
-- # 任务名称:日切表转拉链表数据初始化
-- # 需求编号:
-- # 任务功能说明:
-- # task_id 为业务系统跑批id,gmt_updated和task_id变化不进行闭链开链
-- # ------------------------------------------------------------------------------------------------
-- # 目标表:dwd_table_name_dz
-- # 源表:
-- # 01. ods_table_name_df
-- # ------------------------------------------------------------------------------------------------
-- # 创建人:Lemon.Warn 创建时间:2023.07.28
-- # 修改 - 01 :
-- # 修改人: 修改时间:
-- # 需求编号:
-- # 修改内容:
-- # ------------------------------------------------------------------------------------------------
-- 清空拉链表
TRUNCATE TABLE ods_db.dwd_table_name_dz;
-- 清空临时表
DROP TABLE IF EXISTS tmp_table_name_001;
DROP TABLE IF EXISTS tmp_table_name_002;
DROP TABLE IF EXISTS tmp_table_name_003;
DROP TABLE IF EXISTS tmp_table_name_004;
DROP TABLE IF EXISTS tmp_table_name_005;
-- remark内容多次改动,当前值可能与历史值重复,按id和多字段分组排序,方便相同字段值不同时间段的切割
CREATE TABLE tmp_table_name_001 stored as parquet AS
SELECT id,
tenant_id,
tenant,
member_user_id,
external_user_id,
description,
remark,
create_time,
remark_corp_name,
remark_mobiles,
add_way,
oper_userid,
state,
task_id,
extension,
gmt_created,
gmt_creator,
gmt_updated,
gmt_updater,
deleted,
coalesce(stat_dt_s, stat_dt) as stat_dt,
ROW_NUMBER() OVER (PARTITION BY id ORDER BY gmt_updated ASC, coalesce(stat_dt_s, stat_dt) ASC) as rn_id,
ROW_NUMBER() OVER (
PARTITION BY
id,
tenant_id,
tenant,
member_user_id,
external_user_id,
description,
remark,
create_time,
remark_corp_name,
remark_mobiles,
add_way,
oper_userid,
state,
extension,
gmt_created,
gmt_creator,
deleted
ORDER BY gmt_updated ASC, coalesce(stat_dt_s, stat_dt) ASC) as rn_col
FROM ods_db.ods_table_name_df
;
-- 相同字段值不同时间段的切割
CREATE TABLE tmp_table_name_002 stored as parquet AS
SELECT id,
tenant_id,
tenant,
member_user_id,
external_user_id,
description,
remark,
create_time,
remark_corp_name,
remark_mobiles,
add_way,
oper_userid,
state,
task_id,
extension,
gmt_created,
gmt_creator,
gmt_updated,
gmt_updater,
deleted,
stat_dt,
rn_id,
rn_col,
rn_id - rn_col as rn
FROM tmp_table_name_001
;
-- 去重,获取最最大最小值
CREATE TABLE tmp_table_name_003 stored as parquet AS
SELECT id,
tenant_id,
tenant,
member_user_id,
external_user_id,
description,
remark,
create_time,
remark_corp_name,
remark_mobiles,
add_way,
oper_userid,
state,
max(task_id) as task_id, -- task_id取最新,即取最大
extension,
gmt_created,
gmt_creator,
max(gmt_updated) as gmt_updated, -- gmt_updated取最新,即取最大
max(gmt_updater) as gmt_updater,
deleted,
max(stat_dt) as stat_dt,
min(gmt_updated) as min_gmt_updated, -- 取最早更新时间
min(stat_dt) as min_stat_dt -- 取最早更新时间
FROM tmp_table_name_002
GROUP BY id, tenant_id, tenant, member_user_id, external_user_id, description
, remark, create_time, remark_corp_name, remark_mobiles, add_way, oper_userid
, state, extension, gmt_created, gmt_creator, deleted, rn;
-- 去重,获取最最大最小值
CREATE TABLE tmp_table_name_004 stored as parquet AS
SELECT id as id,
max(tenant_id) as tenant_id,
max(tenant) as tenant,
max(member_user_id) as member_user_id,
max(external_user_id) as external_user_id,
max(description) as description,
max(remark) as remark,
max(create_time) as create_time,
max(remark_corp_name) as remark_corp_name,
max(remark_mobiles) as remark_mobiles,
max(add_way) as add_way,
max(oper_userid) as oper_userid,
max(state) as state,
max(task_id) as task_id,
max(extension) as extension,
case
when min(gmt_created) > gmt_updated then gmt_updated
else min(gmt_created)
end as gmt_created, -- 如果来源表中存在 gmt_created>gmt_updated的情况,需要做gmt_created=gmt_updated 前置处理
max(gmt_creator) as gmt_creator,
gmt_updated as gmt_updated,
max(gmt_updater) as gmt_updater,
max(deleted) as deleted,
max(stat_dt) as stat_dt,
min(min_gmt_updated) as min_gmt_updated,
min(min_stat_dt) as min_stat_dt
FROM tmp_table_name_003
GROUP BY id, gmt_updated;
-- 分组排序
CREATE TABLE tmp_table_name_005 stored as parquet AS
SELECT id,
tenant_id,
tenant,
member_user_id,
external_user_id,
description,
remark,
create_time,
remark_corp_name,
remark_mobiles,
add_way,
oper_userid,
state,
task_id,
extension,
min(gmt_created) over (partition by id) as gmt_created, -- 统一为最小创建时间,避免多条记录创建时间不一致
gmt_creator,
gmt_updated,
gmt_updater,
deleted,
stat_dt,
min_gmt_updated,
min_stat_dt,
ROW_NUMBER() OVER (PARTITION BY id ORDER BY gmt_updated ASC) as rn,
count(1) OVER (PARTITION BY id ) as max_rn -- 当前分组最大序号
FROM tmp_table_name_004;
-- 开链数据 及 被删除的闭链数据
INSERT OVERWRITE TABLE ods_db.dwd_table_name_dz partition (end_dt)
SELECT id as id,
tenant_id as tenant_id,
tenant as tenant,
member_user_id as member_user_id,
external_user_id as external_user_id,
description as description,
remark as remark,
create_time as create_time,
remark_corp_name as remark_corp_name,
remark_mobiles as remark_mobiles,
add_way as add_way,
oper_userid as oper_userid,
state as state,
task_id as task_id,
extension as extension,
gmt_created as gmt_created,
gmt_creator as gmt_creator,
gmt_updated as gmt_updated,
gmt_updater as gmt_updater,
deleted as deleted,
CASE
WHEN REGEXP_REPLACE(SUBSTR(min_gmt_updated, 1, 10), '-', '') > min_stat_dt then min_stat_dt
ELSE REGEXP_REPLACE(SUBSTR(min_gmt_updated, 1, 10), '-', '') END as start_dt, -- case when规避 bizdate+1 的凌晨那部分数据;min_字段获取最早更新时间
CASE
WHEN stat_dt < '${bizdate}' THEN regexp_replace(
cast(date_add(concat(substr(stat_dt, 1, 4), '-', substr(stat_dt, 5, 2), '-',
substr(stat_dt, 7, 2)),
1) as string), '-', '')
ELSE '29991231' END as end_dt -- 如果为闭链数据 end_dt=最新stat_dt+1(不然与正常拉链处理逻辑不符)
FROM tmp_table_name_005 t1
WHERE rn = max_rn;
-- 取最新一条
-- 闭链数据
INSERT INTO TABLE ods_db.dwd_table_name_dz partition (end_dt)
SELECT t1.id as id,
t1.tenant_id as tenant_id,
t1.tenant as tenant,
t1.member_user_id as member_user_id,
t1.external_user_id as external_user_id,
t1.description as description,
t1.remark as remark,
t1.create_time as create_time,
t1.remark_corp_name as remark_corp_name,
t1.remark_mobiles as remark_mobiles,
t1.add_way as add_way,
t1.oper_userid as oper_userid,
t1.state as state,
t1.task_id as task_id,
t1.extension as extension,
t1.gmt_created as gmt_created,
t1.gmt_creator as gmt_creator,
t1.gmt_updated as gmt_updated,
t1.gmt_updater as gmt_updater,
t1.deleted as deleted,
CASE
WHEN REGEXP_REPLACE(SUBSTR(t1.min_gmt_updated, 1, 10), '-', '') > t1.min_stat_dt THEN t1.min_stat_dt
ELSE REGEXP_REPLACE(SUBSTR(t1.min_gmt_updated, 1, 10), '-', '') END as start_dt, -- case when规避 bizdate+1 的凌晨那部分数据;min_字段获取最早更新时间
CASE
WHEN REGEXP_REPLACE(SUBSTR(t2.min_gmt_updated, 1, 10), '-', '') > t2.min_stat_dt THEN t2.min_stat_dt
ELSE REGEXP_REPLACE(SUBSTR(t2.min_gmt_updated, 1, 10), '-', '') END as end_dt -- case when规避 bizdate+1 的凌晨那部分数据;min_字段获取最早更新时间
FROM tmp_table_name_005 t1
JOIN tmp_table_name_005 t2 ON t1.id = t2.id AND t1.rn = t2.rn - 1;
-- 清空临时表 --
DROP TABLE IF EXISTS tmp_table_name_001;
DROP TABLE IF EXISTS tmp_table_name_002;
DROP TABLE IF EXISTS tmp_table_name_003;
DROP TABLE IF EXISTS tmp_table_name_004;
DROP TABLE IF EXISTS tmp_table_name_005;