-- 模拟第一次抽取 odsinsert overwrite table tmp.temp_ods_user partition(dt='2023-05-01')select9527as id
,114as user_id
,'张三'as user_name
,'1988-09-08'as date_of_birth
,'北京市朝阳区'as address_of_birth
,'2023-01-01 10:00:00'as update_time
;-- 模拟第一次抽取 拉链 insert overwrite table tmp.temp_dw_user_chain partition(status='expired',end_date='2023-05-01')selectcasewhen h.change_code<>c.change_code then h.start_date else e.start_date endas start_date
,casewhen h.change_code<>c.change_code then h.change_code else e.change_code endas change_code
,casewhen h.change_code<>c.change_code then h.id else e.id endas id
,casewhen h.change_code<>c.change_code then h.user_id else e.user_id endas user_id
,casewhen h.change_code<>c.change_code then h.user_name else e.user_name endas user_name
,casewhen h.change_code<>c.change_code then h.date_of_birth else e.date_of_birth endas date_of_birth
,casewhen h.change_code<>c.change_code then h.address_of_birth else e.address_of_birth endas address_of_birth
,casewhen h.change_code<>c.change_code then h.update_time else e.update_time endas update_time
from(select*from tmp.temp_dw_user_chain
wherestatus='active'and id isnotnull) h -- 上次的active数据fulljoin(select`(dt|rank)?+.+`from(select id,user_id,user_name,date_of_birth,address_of_birth,update_time,change_code
,row_number(id)as rank
from(select*,md5(concat_ws('_',id,user_id,user_name,date_of_birth,address_of_birth,update_time))as change_code
from tmp.temp_ods_user
where dt ='2023-05-01'and id isnotnull
distribute by id sort by id desc) x
) t
where t.rank =1) c -- 抽取的增量数据on h.id = c.id
fulljoin(select*from tmp.temp_dw_user_chain
wherestatus='expired'and end_date='2023-05-01') e -- 过期数据on e.id = c.id
where h.id isnotnulland c.id isnotnulland(( h.change_code <> c.change_code )or( h.change_code = c.change_code and e.id isnotnull));insert overwrite table tmp.temp_dw_user_chain partition(status='active',end_date='9999-12-31')selectif(h.id isnullor(c.id isnotnulland(h.change_code <> c.change_code)),'2023-05-01',h.start_date)as start_date
,casewhen h.id isnullthen
c.change_code
when h.id isnotnulland c.id isnotnulland h.change_code <> c.change_code then c.change_code
else
h.change_code
endas change_code
,casewhen c.id isnotnullthen c.id else h.id endas id
,casewhen c.id isnotnullthen c.user_id else h.user_id endas user_id
,casewhen c.id isnotnullthen c.user_name else h.user_name endas user_name
,casewhen c.id isnotnullthen c.date_of_birth else h.date_of_birth endas date_of_birth
,casewhen c.id isnotnullthen c.address_of_birth else h.address_of_birth endas address_of_birth
,casewhen c.id isnotnullthen c.update_time else h.update_time endas update_time
from(select*from tmp.temp_dw_user_chain
wherestatus='active'and id isnotnull) h -- 上次的active数据fulljoin(select`(dt|rank)?+.+`from(select id,user_id,user_name,date_of_birth,address_of_birth,update_time,change_code
,row_number(id)as rank
from(select*,md5(concat_ws('_',id,user_id,user_name,date_of_birth,address_of_birth,update_time))as change_code
from tmp.temp_ods_user
where dt ='2023-05-01'and id isnotnull
distribute by id sort by id desc) x
) t
where t.rank =1) c -- 抽取的增量数据on h.id = c.id
;-- 拉链任务可能在一天内手工跑多次,当天第一次跑拉链任务时,EXPIRED分区中是没有数据的,此时会将被更新的旧数据写入EXPIRED分区中。当天第二次手工重跑拉链任务时,EXPIRED分区中已有数据,会直接将EXPIRED分区数据写入EXPIRED分区。-- 拉链SQL中EXPIRED分区是必须使用的。拉链任务当天第二次重跑时ACTIVE分区数据已经更新,不是昨天的状态,不使用EXPIRED分区中已有的数据会清空EXPIRED分区数据。
查看dw拉链之后的数据
start_date
change_code
id
user_id
user_name
date_of_birth
address_of_birth
update_time
status
end_date
2023-05-01
4ac4beee336ffcc0c6afaab74ed6405f
9527
114
张三
1988-09-08
北京市朝阳区
2023-01-01 10:00:00
active
9999-12-31
step3:2023-05-02 第一次变更后抽取
insert overwrite table tmp.temp_ods_user partition (dt='2023-05-02')
select 9527 as id
,114 as user_id
,'张三' as user_name
,'1992-09-08' as date_of_birth
,'北京市朝阳区' as address_of_birth
,'2023-05-02 10:00:00' as update_time
;
-- 模拟第一次变更抽取 拉链
insert overwrite table tmp.temp_dw_user_chain partition (status='expired',end_date='2023-05-02')
select
case when h.change_code<>c.change_code then h.start_date else e.start_date end as start_date
,case when h.change_code<>c.change_code then h.change_code else e.change_code end as change_code
,case when h.change_code<>c.change_code then h.id else e.id end as id
,case when h.change_code<>c.change_code then h.user_id else e.user_id end as user_id
,case when h.change_code<>c.change_code then h.user_name else e.user_name end as user_name
,case when h.change_code<>c.change_code then h.date_of_birth else e.date_of_birth end as date_of_birth
,case when h.change_code<>c.change_code then h.address_of_birth else e.address_of_birth end as address_of_birth
,case when h.change_code<>c.change_code then h.update_time else e.update_time end as update_time
from(select *
from tmp.temp_dw_user_chain
where status = 'active'
and id is not null
) h -- 上次的active数据
full join(select `(dt|rank)?+.+`
from (select id,user_id,user_name,date_of_birth,address_of_birth,update_time,change_code
,row_number(id) as rank
from (select *,md5(concat_ws('_',id,user_id,user_name,date_of_birth,address_of_birth,update_time)) as change_code
from tmp.temp_ods_user
where dt = '2023-05-02'
and id is not null
distribute by id sort by id desc
) x
) t
where t.rank = 1
) c -- 抽取的增量数据
on h.id = c.id
full join(select *
from tmp.temp_dw_user_chain
where status='expired'
and end_date='2023-05-02'
) e -- 过期数据
on e.id = c.id
where h.id is not null and c.id is not null and (( h.change_code <> c.change_code ) or ( h.change_code = c.change_code and e.id is not null))
;
insert overwrite table tmp.temp_dw_user_chain partition (status='active',end_date='9999-12-31')
select if(h.id is null or (c.id is not null and (h.change_code <> c.change_code)),'2023-05-02',h.start_date) as start_date
,case
when h.id is null then
c.change_code
when h.id is not null and c.id is not null and h.change_code <> c.change_code then c.change_code
else
h.change_code
end as change_code
,case when c.id is not null then c.id else h.id end as id
,case when c.id is not null then c.user_id else h.user_id end as user_id
,case when c.id is not null then c.user_name else h.user_name end as user_name
,case when c.id is not null then c.date_of_birth else h.date_of_birth end as date_of_birth
,case when c.id is not null then c.address_of_birth else h.address_of_birth end as address_of_birth
,case when c.id is not null then c.update_time else h.update_time end as update_time
from(select *
from tmp.temp_dw_user_chain
where status = 'active'
and id is not null
) h -- 上次的active数据
full join(select `(dt|rank)?+.+`
from (select id,user_id,user_name,date_of_birth,address_of_birth,update_time,change_code
,row_number(id) as rank
from (select *
,md5(concat_ws('_',id,user_id,user_name,date_of_birth,address_of_birth,update_time)) as change_code
from tmp.temp_ods_user
where dt = '2023-05-02'
and id is not null
distribute by id sort by id desc
) x
) t
where t.rank = 1
) c -- 抽取的增量数据
on h.id = c.id
;
查看dw拉链之后的数据
start_date
change_code
id
user_id
user_name
date_of_birth
address_of_birth
update_time
status
end_date
2023-05-02
ee3915fc4f4ecad9ea1570e391b4e
9527
114
张三
1992-09-08
北京市朝阳区
2023-05-02 10:00:00
active
9999-12-31
2023-05-01
4ac4beee336ffcc0c6afaab74ed6405f
9527
114
张三
1988-09-08
北京市朝阳区
2023-01-01 10:00:00
expired
2023-05-02
step4:2023-05-03 第二次变更后抽取
insert overwrite table tmp.temp_ods_user partition (dt='2023-05-03')
select 9527 as id
,114 as user_id
,'张三' as user_name
,'1992-09-08' as date_of_birth
,'北京市海淀区' as address_of_birth
,'2023-05-03 10:00:00' as update_time
;
-- 模拟第二次变更抽取 拉链
insert overwrite table tmp.temp_dw_user_chain partition (status='expired',end_date='2023-05-03')
select
case when h.change_code<>c.change_code then h.start_date else e.start_date end as start_date
,case when h.change_code<>c.change_code then h.change_code else e.change_code end as change_code
,case when h.change_code<>c.change_code then h.id else e.id end as id
,case when h.change_code<>c.change_code then h.user_id else e.user_id end as user_id
,case when h.change_code<>c.change_code then h.user_name else e.user_name end as user_name
,case when h.change_code<>c.change_code then h.date_of_birth else e.date_of_birth end as date_of_birth
,case when h.change_code<>c.change_code then h.address_of_birth else e.address_of_birth end as address_of_birth
,case when h.change_code<>c.change_code then h.update_time else e.update_time end as update_time
from(select *
from tmp.temp_dw_user_chain
where status = 'active'
and id is not null
) h -- 上次的active数据
full join(select `(dt|rank)?+.+`
from (select id,user_id,user_name,date_of_birth,address_of_birth,update_time,change_code
,row_number(id) as rank
from (select *,md5(concat_ws('_',id,user_id,user_name,date_of_birth,address_of_birth,update_time)) as change_code
from tmp.temp_ods_user
where dt = '2023-05-03'
and id is not null
distribute by id sort by id desc
) x
) t
where t.rank = 1
) c -- 抽取的增量数据
on h.id = c.id
full join(select *
from tmp.temp_dw_user_chain
where status='expired'
and end_date='2023-05-03'
) e -- 过期数据
on e.id = c.id
where h.id is not null and c.id is not null and (( h.change_code <> c.change_code ) or ( h.change_code = c.change_code and e.id is not null))
;
insert overwrite table tmp.temp_dw_user_chain partition (status='active',end_date='9999-12-31')
select if(h.id is null or (c.id is not null and (h.change_code <> c.change_code)),'2023-05-03',h.start_date) as start_date
,case
when h.id is null then
c.change_code
when h.id is not null and c.id is not null and h.change_code <> c.change_code then c.change_code
else
h.change_code
end as change_code
,case when c.id is not null then c.id else h.id end as id
,case when c.id is not null then c.user_id else h.user_id end as user_id
,case when c.id is not null then c.user_name else h.user_name end as user_name
,case when c.id is not null then c.date_of_birth else h.date_of_birth end as date_of_birth
,case when c.id is not null then c.address_of_birth else h.address_of_birth end as address_of_birth
,case when c.id is not null then c.update_time else h.update_time end as update_time
from(select *
from tmp.temp_dw_user_chain
where status = 'active'
and id is not null
) h -- 上次的active数据
full join(select `(dt|rank)?+.+`
from (select id,user_id,user_name,date_of_birth,address_of_birth,update_time,change_code
,row_number(id) as rank
from (select *
,md5(concat_ws('_',id,user_id,user_name,date_of_birth,address_of_birth,update_time)) as change_code
from tmp.temp_ods_user
where dt = '2023-05-03'
and id is not null
distribute by id sort by id desc
) x
) t
where t.rank = 1
) c -- 抽取的增量数据
on h.id = c.id
;