日切表转拉链表初始化，分组排序

Lemon.Warn

已于 2023-07-28 11:18:01 修改

阅读量145

点赞数

分类专栏：大数据文章标签：数据库大数据排序算法

于 2023-07-28 11:15:52 首次发布

本文链接：https://blog.csdn.net/Fread325/article/details/131974938

版权

大数据专栏收录该内容

14 篇文章 0 订阅

订阅专栏

1.需求简略描述

需要将一个日切片表数据转换为拉链数据，其中remark字段数值变化无规律且可能与历史值相同，相邻日分区相同的remark值整合到同一拉链区间，拉链数据end_dt='29991231'为开链区间

id	remark	stat_dt	id	remark	start_dt	end_dt
1	a	20230701	1	a	20230701	20230703
1	a	20230702	1	b	20230703	20230706
1	b	20230703	1	a	20230706	20230707
1	b	20230704	1	b	20230707	20230708
1	b	20230705	1	a	20230708	29991231
1	a	20230706	2	a	20230701	20230706
1	b	20230707	2	b	20230706	20230707
1	a	20230708	2	a	20230707	29991231
1	a	20230709
2	a	20230701
2	a	20230702
2	a	20230703
2	a	20230704
2	a	20230705
2	b	20230706
2	a	20230707
2	a	20230708
2	a	20230709

2.问题

若直接按照id,remark进行group by 取stat_dt最大最小值，后续处理略（参考4.原需求完整代码），产生的结果会出现区间重叠

id	remark	start_dt	end_dt
1	a	20230701	20230709
1	b	20230703	29991231
2	a	20230701	20230709
2	b	20230706	29991231

3.处理方案

分别以id和id+remark分组排序获取rn_id和rn_col，再通过rn_id和rn_col差值获取分组并列排序值rn，可得相邻日分区remark值相同时rn相等，相邻日分区remark值不同时rn跳跃递增，在按照id,remark,rn进行group by 处理，后续处理略（参考4.原需求完整代码）

id	remark	stat_dt	rn_id	rn_rmk	rn
1	a	20230701	1	1	0
1	a	20230702	2	2	0
1	b	20230703	3	1	2
1	b	20230704	4	2	2
1	b	20230705	5	3	2
1	a	20230706	6	1	5
1	b	20230707	7	1	6
1	a	20230708	8	1	7
1	a	20230709	9	2	7
2	a	20230701	1	1	0
2	a	20230702	2	2	0
2	a	20230703	3	3	0
2	a	20230704	4	4	0
2	a	20230705	5	5	0
2	b	20230706	6	1	5
2	a	20230707	7	1	6
2	a	20230708	8	2	6
2	a	20230709	9	3	6

4.原需求完整代码

-- # 任务名称：日切表转拉链表数据初始化
-- # 需求编号：
-- # 任务功能说明：
-- # task_id 为业务系统跑批id,gmt_updated和task_id变化不进行闭链开链
-- # ------------------------------------------------------------------------------------------------
-- # 目标表：dwd_table_name_dz
-- # 源表：
-- # 01. ods_table_name_df
-- # ------------------------------------------------------------------------------------------------
-- # 创建人：Lemon.Warn 创建时间：2023.07.28
-- # 修改 - 01 ：
-- #     修改人:    修改时间:
-- #     需求编号：
-- #     修改内容:
-- # ------------------------------------------------------------------------------------------------


-- 清空拉链表
TRUNCATE TABLE ods_db.dwd_table_name_dz;

-- 清空临时表
DROP TABLE IF EXISTS tmp_table_name_001;
DROP TABLE IF EXISTS tmp_table_name_002;
DROP TABLE IF EXISTS tmp_table_name_003;
DROP TABLE IF EXISTS tmp_table_name_004;
DROP TABLE IF EXISTS tmp_table_name_005;

-- remark内容多次改动，当前值可能与历史值重复，按id和多字段分组排序，方便相同字段值不同时间段的切割
CREATE TABLE tmp_table_name_001 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       task_id,
       extension,
       gmt_created,
       gmt_creator,
       gmt_updated,
       gmt_updater,
       deleted,
       coalesce(stat_dt_s, stat_dt)                                                                   as stat_dt,
       ROW_NUMBER() OVER (PARTITION BY id ORDER BY gmt_updated ASC, coalesce(stat_dt_s, stat_dt) ASC) as rn_id,
       ROW_NUMBER() OVER (
           PARTITION BY
               id,
               tenant_id,
               tenant,
               member_user_id,
               external_user_id,
               description,
               remark,
               create_time,
               remark_corp_name,
               remark_mobiles,
               add_way,
               oper_userid,
               state,
               extension,
               gmt_created,
               gmt_creator,
               deleted
           ORDER BY gmt_updated ASC, coalesce(stat_dt_s, stat_dt) ASC)                                as rn_col
FROM ods_db.ods_table_name_df
;

-- 相同字段值不同时间段的切割
CREATE TABLE tmp_table_name_002 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       task_id,
       extension,
       gmt_created,
       gmt_creator,
       gmt_updated,
       gmt_updater,
       deleted,
       stat_dt,
       rn_id,
       rn_col,
       rn_id - rn_col as rn
FROM tmp_table_name_001
;

-- 去重，获取最最大最小值
CREATE TABLE tmp_table_name_003 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       max(task_id)     as task_id,         -- task_id取最新，即取最大
       extension,
       gmt_created,
       gmt_creator,
       max(gmt_updated) as gmt_updated,     -- gmt_updated取最新，即取最大
       max(gmt_updater) as gmt_updater,
       deleted,
       max(stat_dt)     as stat_dt,
       min(gmt_updated) as min_gmt_updated, --  取最早更新时间
       min(stat_dt)     as min_stat_dt      --  取最早更新时间
FROM tmp_table_name_002
GROUP BY id, tenant_id, tenant, member_user_id, external_user_id, description
       , remark, create_time, remark_corp_name, remark_mobiles, add_way, oper_userid
       , state, extension, gmt_created, gmt_creator, deleted, rn;

-- 去重，获取最最大最小值
CREATE TABLE tmp_table_name_004 stored as parquet AS
SELECT id                    as id,
       max(tenant_id)        as tenant_id,
       max(tenant)           as tenant,
       max(member_user_id)   as member_user_id,
       max(external_user_id) as external_user_id,
       max(description)      as description,
       max(remark)           as remark,
       max(create_time)      as create_time,
       max(remark_corp_name) as remark_corp_name,
       max(remark_mobiles)   as remark_mobiles,
       max(add_way)          as add_way,
       max(oper_userid)      as oper_userid,
       max(state)            as state,
       max(task_id)          as task_id,
       max(extension)        as extension,
       case
           when min(gmt_created) > gmt_updated then gmt_updated
           else min(gmt_created)
           end               as gmt_created, -- 如果来源表中存在 gmt_created>gmt_updated的情况，需要做gmt_created=gmt_updated 前置处理
       max(gmt_creator)      as gmt_creator,
       gmt_updated           as gmt_updated,
       max(gmt_updater)      as gmt_updater,
       max(deleted)          as deleted,
       max(stat_dt)          as stat_dt,
       min(min_gmt_updated)  as min_gmt_updated,
       min(min_stat_dt)      as min_stat_dt
FROM tmp_table_name_003
GROUP BY id, gmt_updated;

-- 分组排序
CREATE TABLE tmp_table_name_005 stored as parquet AS
SELECT id,
       tenant_id,
       tenant,
       member_user_id,
       external_user_id,
       description,
       remark,
       create_time,
       remark_corp_name,
       remark_mobiles,
       add_way,
       oper_userid,
       state,
       task_id,
       extension,
       min(gmt_created) over (partition by id)                      as gmt_created, -- 统一为最小创建时间，避免多条记录创建时间不一致
       gmt_creator,
       gmt_updated,
       gmt_updater,
       deleted,
       stat_dt,
       min_gmt_updated,
       min_stat_dt,
       ROW_NUMBER() OVER (PARTITION BY id ORDER BY gmt_updated ASC) as rn,
       count(1) OVER (PARTITION BY id )                             as max_rn       -- 当前分组最大序号
FROM tmp_table_name_004;

-- 开链数据 及 被删除的闭链数据
INSERT OVERWRITE TABLE ods_db.dwd_table_name_dz partition (end_dt)
SELECT id                                                                   as id,
       tenant_id                                                            as tenant_id,
       tenant                                                               as tenant,
       member_user_id                                                       as member_user_id,
       external_user_id                                                     as external_user_id,
       description                                                          as description,
       remark                                                               as remark,
       create_time                                                          as create_time,
       remark_corp_name                                                     as remark_corp_name,
       remark_mobiles                                                       as remark_mobiles,
       add_way                                                              as add_way,
       oper_userid                                                          as oper_userid,
       state                                                                as state,
       task_id                                                              as task_id,
       extension                                                            as extension,
       gmt_created                                                          as gmt_created,
       gmt_creator                                                          as gmt_creator,
       gmt_updated                                                          as gmt_updated,
       gmt_updater                                                          as gmt_updater,
       deleted                                                              as deleted,
       CASE
           WHEN REGEXP_REPLACE(SUBSTR(min_gmt_updated, 1, 10), '-', '') > min_stat_dt then min_stat_dt
           ELSE REGEXP_REPLACE(SUBSTR(min_gmt_updated, 1, 10), '-', '') END as start_dt, -- case when规避 bizdate+1 的凌晨那部分数据；min_字段获取最早更新时间
       CASE
           WHEN stat_dt < '${bizdate}' THEN regexp_replace(
                   cast(date_add(concat(substr(stat_dt, 1, 4), '-', substr(stat_dt, 5, 2), '-',
                                        substr(stat_dt, 7, 2)),
                                 1) as string), '-', '')
           ELSE '29991231' END                                              as end_dt    -- 如果为闭链数据 end_dt=最新stat_dt+1（不然与正常拉链处理逻辑不符）
FROM tmp_table_name_005 t1
WHERE rn = max_rn;
-- 取最新一条

-- 闭链数据
INSERT INTO TABLE ods_db.dwd_table_name_dz partition (end_dt)
SELECT t1.id                                                                   as id,
       t1.tenant_id                                                            as tenant_id,
       t1.tenant                                                               as tenant,
       t1.member_user_id                                                       as member_user_id,
       t1.external_user_id                                                     as external_user_id,
       t1.description                                                          as description,
       t1.remark                                                               as remark,
       t1.create_time                                                          as create_time,
       t1.remark_corp_name                                                     as remark_corp_name,
       t1.remark_mobiles                                                       as remark_mobiles,
       t1.add_way                                                              as add_way,
       t1.oper_userid                                                          as oper_userid,
       t1.state                                                                as state,
       t1.task_id                                                              as task_id,
       t1.extension                                                            as extension,
       t1.gmt_created                                                          as gmt_created,
       t1.gmt_creator                                                          as gmt_creator,
       t1.gmt_updated                                                          as gmt_updated,
       t1.gmt_updater                                                          as gmt_updater,
       t1.deleted                                                              as deleted,
       CASE
           WHEN REGEXP_REPLACE(SUBSTR(t1.min_gmt_updated, 1, 10), '-', '') > t1.min_stat_dt THEN t1.min_stat_dt
           ELSE REGEXP_REPLACE(SUBSTR(t1.min_gmt_updated, 1, 10), '-', '') END as start_dt, -- case when规避 bizdate+1 的凌晨那部分数据；min_字段获取最早更新时间
       CASE
           WHEN REGEXP_REPLACE(SUBSTR(t2.min_gmt_updated, 1, 10), '-', '') > t2.min_stat_dt THEN t2.min_stat_dt
           ELSE REGEXP_REPLACE(SUBSTR(t2.min_gmt_updated, 1, 10), '-', '') END as end_dt    -- case when规避 bizdate+1 的凌晨那部分数据；min_字段获取最早更新时间
FROM tmp_table_name_005 t1
         JOIN tmp_table_name_005 t2 ON t1.id = t2.id AND t1.rn = t2.rn - 1;

-- 清空临时表  --
DROP TABLE IF EXISTS tmp_table_name_001;
DROP TABLE IF EXISTS tmp_table_name_002;
DROP TABLE IF EXISTS tmp_table_name_003;
DROP TABLE IF EXISTS tmp_table_name_004;
DROP TABLE IF EXISTS tmp_table_name_005;