日切表转拉链表初始化,分组排序

1.需求简略描述

需要将一个日切片表数据转换为拉链数据,其中remark字段数值变化无规律且可能与历史值相同,相邻日分区相同的remark值整合到同一拉链区间,拉链数据end_dt='29991231'为开链区间

idremarkstat_dtid remarkstart_dtend_dt
1a202307011a2023070120230703
1a202307021b2023070320230706
1b202307031a2023070620230707
1b202307041b2023070720230708
1b202307051a2023070829991231
1a202307062a2023070120230706
1b202307072b2023070620230707
1a202307082a2023070729991231
1a20230709
2a20230701
2a20230702
2a20230703
2a20230704
2a20230705
2b20230706
2a20230707
2a20230708
2a20230709
2.问题

若直接按照id,remark进行group by 取stat_dt最大最小值,后续处理略(参考4.原需求完整代码),产生的结果会出现区间重叠

id remarkstart_dtend_dt
1a2023070120230709
1b2023070329991231
2a2023070120230709
2b2023070629991231
3.处理方案

分别以id和id+remark分组排序获取rn_id和rn_col,再通过rn_id和rn_col差值获取分组并列排序值rn,可得相邻日分区remark值相同时rn相等,相邻日分区remark值不同时rn跳跃递增,在按照id,remark,rn进行group by 处理,后续处理略(参考4.原需求完整代码)

idremarkstat_dtrn_idrn_rmkrn
1a20230701110
1a20230702220
1b20230703312
1b20230704422
1b20230705532
1a20230706615
1b20230707716
1a20230708817
1a20230709927
2a20230701110
2a20230702220
2a20230703330
2a20230704440
2a20230705550
2b20230706615
2a20230707716
2a20230708826
2a20230709936
4.原需求完整代码
-- # 任务名称:日切表转拉链表数据初始化
-- # 需求编号:
-- # 任务功能说明:
-- # task_id 为业务系统跑批id,gmt_updated和task_id变化不进行闭链开链
-- # ------------------------------------------------------------------------------------------------
-- # 目标表:dwd_table_name_dz
-- # 源表:
-- # 01. ods_table_name_df
-- # ------------------------------------------------------------------------------------------------
-- # 创建人:Lemon.Warn 创建时间:2023.07.28
-- # 修改 - 01 :
-- #     修改人:    修改时间:
-- #     需求编号:
-- #     修改内容:
-- # ------------------------------------------------------------------------------------------------


-- 清空拉链表
TRUNCATE TABLE ods_db.dwd_table_name_dz;

-- 清空临时表
DROP TABLE IF EXISTS tmp_table_name_001;
DROP TABLE IF EXISTS tmp_table_name_002;
DROP TABLE IF EXISTS tmp_table_name_003;
DROP TABLE IF EXISTS tmp_table_name_004;
DROP TABLE IF EXISTS tmp_table_name_005;

-- remark内容多次改动,当前值可能与历史值重复,按id和多字段分组排序,方便相同字段值不同时间段的切割
CREATE TABLE tmp_table_name_001 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       task_id,
       extension,
       gmt_created,
       gmt_creator,
       gmt_updated,
       gmt_updater,
       deleted,
       coalesce(stat_dt_s, stat_dt)                                                                   as stat_dt,
       ROW_NUMBER() OVER (PARTITION BY id ORDER BY gmt_updated ASC, coalesce(stat_dt_s, stat_dt) ASC) as rn_id,
       ROW_NUMBER() OVER (
           PARTITION BY
               id,
               tenant_id,
               tenant,
               member_user_id,
               external_user_id,
               description,
               remark,
               create_time,
               remark_corp_name,
               remark_mobiles,
               add_way,
               oper_userid,
               state,
               extension,
               gmt_created,
               gmt_creator,
               deleted
           ORDER BY gmt_updated ASC, coalesce(stat_dt_s, stat_dt) ASC)                                as rn_col
FROM ods_db.ods_table_name_df
;

-- 相同字段值不同时间段的切割
CREATE TABLE tmp_table_name_002 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       task_id,
       extension,
       gmt_created,
       gmt_creator,
       gmt_updated,
       gmt_updater,
       deleted,
       stat_dt,
       rn_id,
       rn_col,
       rn_id - rn_col as rn
FROM tmp_table_name_001
;

-- 去重,获取最最大最小值
CREATE TABLE tmp_table_name_003 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       max(task_id)     as task_id,         -- task_id取最新,即取最大
       extension,
       gmt_created,
       gmt_creator,
       max(gmt_updated) as gmt_updated,     -- gmt_updated取最新,即取最大
       max(gmt_updater) as gmt_updater,
       deleted,
       max(stat_dt)     as stat_dt,
       min(gmt_updated) as min_gmt_updated, --  取最早更新时间
       min(stat_dt)     as min_stat_dt      --  取最早更新时间
FROM tmp_table_name_002
GROUP BY id, tenant_id, tenant, member_user_id, external_user_id, description
       , remark, create_time, remark_corp_name, remark_mobiles, add_way, oper_userid
       , state, extension, gmt_created, gmt_creator, deleted, rn;

-- 去重,获取最最大最小值
CREATE TABLE tmp_table_name_004 stored as parquet AS
SELECT id                    as id,
       max(tenant_id)        as tenant_id,
       max(tenant)           as tenant,
       max(member_user_id)   as member_user_id,
       max(external_user_id) as external_user_id,
       max(description)      as description,
       max(remark)           as remark,
       max(create_time)      as create_time,
       max(remark_corp_name) as remark_corp_name,
       max(remark_mobiles)   as remark_mobiles,
       max(add_way)          as add_way,
       max(oper_userid)      as oper_userid,
       max(state)            as state,
       max(task_id)          as task_id,
       max(extension)        as extension,
       case
           when min(gmt_created) > gmt_updated then gmt_updated
           else min(gmt_created)
           end               as gmt_created, -- 如果来源表中存在 gmt_created>gmt_updated的情况,需要做gmt_created=gmt_updated 前置处理
       max(gmt_creator)      as gmt_creator,
       gmt_updated           as gmt_updated,
       max(gmt_updater)      as gmt_updater,
       max(deleted)          as deleted,
       max(stat_dt)          as stat_dt,
       min(min_gmt_updated)  as min_gmt_updated,
       min(min_stat_dt)      as min_stat_dt
FROM tmp_table_name_003
GROUP BY id, gmt_updated;

-- 分组排序
CREATE TABLE tmp_table_name_005 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       task_id,
       extension,
       min(gmt_created) over (partition by id)                      as gmt_created, -- 统一为最小创建时间,避免多条记录创建时间不一致
       gmt_creator,
       gmt_updated,
       gmt_updater,
       deleted,
       stat_dt,
       min_gmt_updated,
       min_stat_dt,
       ROW_NUMBER() OVER (PARTITION BY id ORDER BY gmt_updated ASC) as rn,
       count(1) OVER (PARTITION BY id )                             as max_rn       -- 当前分组最大序号
FROM tmp_table_name_004;

-- 开链数据 及 被删除的闭链数据
INSERT OVERWRITE TABLE ods_db.dwd_table_name_dz partition (end_dt)
SELECT id                                                                   as id,
       tenant_id                                                            as tenant_id,
       tenant                                                               as tenant,
       member_user_id                                                       as member_user_id,
       external_user_id                                                     as external_user_id,
       description                                                          as description,
       remark                                                               as remark,
       create_time                                                          as create_time,
       remark_corp_name                                                     as remark_corp_name,
       remark_mobiles                                                       as remark_mobiles,
       add_way                                                              as add_way,
       oper_userid                                                          as oper_userid,
       state                                                                as state,
       task_id                                                              as task_id,
       extension                                                            as extension,
       gmt_created                                                          as gmt_created,
       gmt_creator                                                          as gmt_creator,
       gmt_updated                                                          as gmt_updated,
       gmt_updater                                                          as gmt_updater,
       deleted                                                              as deleted,
       CASE
           WHEN REGEXP_REPLACE(SUBSTR(min_gmt_updated, 1, 10), '-', '') > min_stat_dt then min_stat_dt
           ELSE REGEXP_REPLACE(SUBSTR(min_gmt_updated, 1, 10), '-', '') END as start_dt, -- case when规避 bizdate+1 的凌晨那部分数据;min_字段获取最早更新时间
       CASE
           WHEN stat_dt < '${bizdate}' THEN regexp_replace(
                   cast(date_add(concat(substr(stat_dt, 1, 4), '-', substr(stat_dt, 5, 2), '-',
                                        substr(stat_dt, 7, 2)),
                                 1) as string), '-', '')
           ELSE '29991231' END                                              as end_dt    -- 如果为闭链数据 end_dt=最新stat_dt+1(不然与正常拉链处理逻辑不符)
FROM tmp_table_name_005 t1
WHERE rn = max_rn;
-- 取最新一条

-- 闭链数据
INSERT INTO TABLE ods_db.dwd_table_name_dz partition (end_dt)
SELECT t1.id                                                                   as id,
       t1.tenant_id                                                            as tenant_id,
       t1.tenant                                                               as tenant,
       t1.member_user_id                                                       as member_user_id,
       t1.external_user_id                                                     as external_user_id,
       t1.description                                                          as description,
       t1.remark                                                               as remark,
       t1.create_time                                                          as create_time,
       t1.remark_corp_name                                                     as remark_corp_name,
       t1.remark_mobiles                                                       as remark_mobiles,
       t1.add_way                                                              as add_way,
       t1.oper_userid                                                          as oper_userid,
       t1.state                                                                as state,
       t1.task_id                                                              as task_id,
       t1.extension                                                            as extension,
       t1.gmt_created                                                          as gmt_created,
       t1.gmt_creator                                                          as gmt_creator,
       t1.gmt_updated                                                          as gmt_updated,
       t1.gmt_updater                                                          as gmt_updater,
       t1.deleted                                                              as deleted,
       CASE
           WHEN REGEXP_REPLACE(SUBSTR(t1.min_gmt_updated, 1, 10), '-', '') > t1.min_stat_dt THEN t1.min_stat_dt
           ELSE REGEXP_REPLACE(SUBSTR(t1.min_gmt_updated, 1, 10), '-', '') END as start_dt, -- case when规避 bizdate+1 的凌晨那部分数据;min_字段获取最早更新时间
       CASE
           WHEN REGEXP_REPLACE(SUBSTR(t2.min_gmt_updated, 1, 10), '-', '') > t2.min_stat_dt THEN t2.min_stat_dt
           ELSE REGEXP_REPLACE(SUBSTR(t2.min_gmt_updated, 1, 10), '-', '') END as end_dt    -- case when规避 bizdate+1 的凌晨那部分数据;min_字段获取最早更新时间
FROM tmp_table_name_005 t1
         JOIN tmp_table_name_005 t2 ON t1.id = t2.id AND t1.rn = t2.rn - 1;

-- 清空临时表  --
DROP TABLE IF EXISTS tmp_table_name_001;
DROP TABLE IF EXISTS tmp_table_name_002;
DROP TABLE IF EXISTS tmp_table_name_003;
DROP TABLE IF EXISTS tmp_table_name_004;
DROP TABLE IF EXISTS tmp_table_name_005;

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值