生成拉链表思路分析
ods层
show databases;
create database test_zip;
-- ods
drop table if exists test_zip.ods_user_info;
create table test_zip.ods_user_info
(
id string comment 'id',
name string comment '姓名'
) comment '用户信息表'
partitioned by (load_date string comment '分区字段');
-- 装载语句
-- 将以下数据插入 ods_user_info
insert into table test_zip.ods_user_info values ("1", "张三", "2020-06-14");
insert into table test_zip.ods_user_info values ("2", "李四", "2020-06-14");
insert into table test_zip.ods_user_info values ("3", "王五", "2020-06-14");
insert into table test_zip.ods_user_info values ("4", "赵六", "2020-06-14");
insert into table test_zip.ods_user_info values ("5", "小明", "2020-06-14");
insert into table test_zip.ods_user_info values ("6", "小华", "2020-06-14");
insert into table test_zip.ods_user_info values ("7", "小强", "2020-06-14");
-- 第二天的新增和变化数据
insert into table test_zip.ods_user_info values ("6", "大华", "2020-06-15");
insert into table test_zip.ods_user_info values ("7", "大强", "2020-06-15");
insert into table test_zip.ods_user_info values ("8", "马克", "2020-06-15");
insert into table test_zip.ods_user_info values ("9", "威廉", "2020-06-15");
这里直接给 ods 层插入了4天的数据
dim层 初始化拉链表
drop table if exists dim_user_info_zip;
create table if not exists dim_user_info_zip
(
id string comment 'id',
name string comment '姓名',
start_date string comment '开始日期',
end_date string comment '日期'
) comment '用户拉链表' partitioned by (dt string comment '分区字段');
首日装载
-- 首日装载(全量数据载) 将 2020-06-14 作为数仓上线第一天
insert overwrite table dim_user_info_zip partition (dt)
select id,
name,
'2020-06-14' as start_date,
'9999-12-31' as end_date,
'9999-12-31' as dt
from ods_user_info
where dt = '2020-06-14';
select * from dim_user_info_zip;
每日装载(15号)
-- todo 2020-06-15
-- 每日装载
-- 每日装载 获取第二天新增和变化的数据
with tmp as (
select old.id as old_id,
old.name as old_name,
old.start_date as old_start_date,
old.end_date as old_end_date,
new.id as new_id,
new.name as new_name,
new.start_date as new_start_date,
new.end_date as new_end_date
from (
select id,
name,
start_date,
end_date
from dim_user_info_zip
where dt = '9999-12-31'
) old
full outer join
( -- 获取第二天新增和变化的数据
select id,
name,
'2020-06-15' as start_date,
'9999-12-31' as end_date
from ods_user_info
where dt = '2020-06-15'
) new
on old.id = new.id
)
insert overwrite table dim_user_info_zip partition (dt)
select if(new_id is not null, new_id, old_id) id,
if(new_id is not null, new_name, old_name) name,
if(new_id is not null, new_start_date, old_start_date) start_date,
if(new_id is not null, new_end_date, old_end_date) end_date,
if(new_id is not null, new_end_date, old_end_date) dt
from tmp
union all
select old_id as id,
old_name as name,
old_start_date as start_date,
cast(date_sub('2020-06-15', 1) as string) as end_date,
cast(date_sub('2020-06-15', 1) as string) as dt
from tmp
where old_id is not null
and new_id is not null;
select * from dim_user_info_zip;
select id,
name,
start_date,
'9999-12-31' as end_date,
'9999-12-31' as dt
from dim_user_info_zip
where end_date >= '2020-06-15'
and start_date <= '2020-06-15';
-- 15号也是当天最新数据
select *
from dim_user_info_zip
where end_date='9999-12-31';
每日装载(16号)
-- todo 2020-06-16
-- 每日装载
-- 每日装载 获取第二天新增和变化的数据
with tmp as (
select old.id as old_id,
old.name as old_name,
old.start_date as old_start_date,
old.end_date as old_end_date,
new.id as new_id,
new.name as new_name,
new.start_date as new_start_date,
new.end_date as new_end_date
from (
select id,
name,
start_date,
end_date
from dim_user_info_zip
where dt = '9999-12-31'
) old
full outer join
( -- 获取第二天新增和变化的数据
select id,
name,
'2020-06-16' as start_date,
'9999-12-31' as end_date
from ods_user_info
where dt = '2020-06-16'
) new
on old.id = new.id
)
insert overwrite table dim_user_info_zip partition (dt)
select if(new_id is not null, new_id, old_id) id,
if(new_id is not null, new_name, old_name) name,
if(new_id is not null, new_start_date, old_start_date) start_date,
if(new_id is not null, new_end_date, old_end_date) end_date,
if(new_id is not null, new_end_date, old_end_date) dt
from tmp
union all
select old_id as id,
old_name as name,
old_start_date as start_date,
cast(date_sub('2020-06-16', 1) as string) as end_date,
cast(date_sub('2020-06-16', 1) as string) as dt
from tmp
where old_id is not null
and new_id is not null;
select * from dim_user_info_zip;
select id,
name,
start_date,
'9999-12-31' as end_date,
'9999-12-31' as dt
from dim_user_info_zip
where end_date >= '2020-06-16'
and start_date <= '2020-06-16';
-- 16号也是当天最新数据
select *
from dim_user_info_zip
where end_date='9999-12-31';
每日装载(17号)
-- todo 2020-06-17
with tmp as (
select old.id as old_id,
old.name as old_name,
old.start_date as old_start_date,
old.end_date as old_end_date,
new.id as new_id,
new.name as new_name,
new.start_date as new_start_date,
new.end_date as new_end_date
from (
select id,
name,
start_date,
end_date
from dim_user_info_zip
where dt = '9999-12-31'
) old
full outer join
( -- 获取第二天新增和变化的数据
select id,
name,
'2020-06-17' as start_date,
'9999-12-31' as end_date
from ods_user_info
where dt = '2020-06-17'
) new
on old.id = new.id
)
insert overwrite table dim_user_info_zip partition (dt)
select if(new_id is not null, new_id, old_id) id,
if(new_id is not null, new_name, old_name) name,
if(new_id is not null, new_start_date, old_start_date) start_date,
if(new_id is not null, new_end_date, old_end_date) end_date,
if(new_id is not null, new_end_date, old_end_date) dt
from tmp
union all
select old_id as id,
old_name as name,
old_start_date as start_date,
cast(date_sub('2020-06-17', 1) as string) as end_date,
cast(date_sub('2020-06-17', 1) as string) as dt
from tmp
where old_id is not null
and new_id is not null;
select * from dim_user_info_zip;
select id,
name,
start_date,
'9999-12-31' as end_date,
'9999-12-31' as dt
from dim_user_info_zip
where end_date >= '2020-06-16'
and start_date <= '2020-06-16';
-- 17号也是当天最新数据
select *
from dim_user_info_zip
where end_date='9999-12-31';
拉链表回滚
假设2020-06-16号的数据出错了,要进行回滚,要回滚到2020-06-15,然后重新跑16号、17号的数据
1、重修跑16、17号数据,前提是,16号、17号的数据已经修复正确
2、注意备份数据
-- 拉链表回滚
-- 1、回滚到2020-06-15
insert overwrite table dim_user_info_zip partition (dt)
select id,
name,
start_date,
'9999-12-31' as end_date, -- 将15号,闭链的数据,重新开链,即《威廉》
'9999-12-31' as dt -- 最新分区
from dim_user_info_zip
where end_date >= '2020-06-15'
and start_date <= '2020-06-15';
-- 2、重新跑16号、17号的数据
-- 2.1 跑一下 每日装载(16号)
-- 2.2 跑一下 每日装载(17号)