1.创建业务表并初始数据
drop table if exists mall_user;
create table mall_user
(
uid bigint unsigned auto_increment comment '用户唯一ID' primary key,
login_name varchar(32) not null comment '用户登录名',
nick_name varchar(32) not null default '请设置昵称' comment '用户昵称',
remark varchar(255) not null default '' comment '备注',
create_time timestamp default CURRENT_TIMESTAMP null comment '创建时间',
update_time timestamp default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP comment '更新数据',
is_del tinyint not null default '0' comment '删除标记 0-正常 1-已删除'
) comment '用户信息业务表' default character set utf8mb4;
insert into mall_user(uid, login_name, nick_name, remark, create_time, update_time)
values
(1001, 'loginName001','nickName001', '初始数据', date_sub(now(), interval 2 day), date_sub(now(), interval 2 day)),
(1002, 'loginName002','nickName002', '初始数据', date_sub(now(), interval 3 day), date_sub(now(), interval 3 day)),
(1003, 'loginName003','nickName003', '初始数据', date_sub(now(), interval 4 day), date_sub(now(), interval 4 day)),
(1004, 'loginName004','nickName004', '初始数据', date_sub(now(), interval 5 day), date_sub(now(), interval 5 day));
update mall_user set is_del=1,update_time=date_sub(now(), interval 2 day) where uid=1002;
2.假设业务表初始同步到ods表,插入拉链表,增加记录开始时间和结束时间
create table dim_mall_user_zip
(
uid bigint unsigned auto_increment comment '用户唯一ID' primary key,
login_name varchar(32) not null comment '用户登录名',
nick_name varchar(32) not null default '请设置昵称' comment '用户昵称',
remark varchar(255) not null default '' comment '备注',
create_time timestamp default CURRENT_TIMESTAMP null comment '创建时间',
update_time timestamp default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP comment '更新数据',
is_del tinyint not null default '0' comment '删除标记 0-正常 1-已删除',
start_date date comment '生效时间',
end_date date comment '结束时间'
) comment '用户信息业务表-拉链表' default character set utf8mb4;
2.1首次装载数据-初始化拉链表-有效数据
replace into dim_mall_user_zip(uid, login_name, nick_name, remark, create_time, update_time, is_del, start_date, end_date)
select uid, login_name, nick_name, remark, create_time, update_time, is_del,
date_format(create_time, '%Y-%m-%d') start_date,
'9999-12-31' end_date
from mall_user
where is_del=0;
2.2首次装载数据-初始化拉链表-新建一个分区放置过期数据
create table dim_mall_user_zip_expire_20220305 like dim_mall_user_zip;
replace into dim_mall_user_zip(uid, login_name, nick_name, remark, create_time, update_time, is_del, start_date, end_date)
select uid, login_name, nick_name, remark, create_time, update_time, is_del,
date_format(create_time, '%Y-%m-%d') start_date,
date_format(update_time, '%Y-%m-%d') end_date
from mall_user
where is_del=1;
2.3此时全部的数据(hive里面的多个分区)
select * from dim_mall_user_zip dmuz
union
select * from dim_mall_user_zip_expire_20220305 dmuze
order by uid
3假设第二天,原业务表增加1条记录,修改1条记录,逻辑删除1条记录
insert into mall_user(uid, login_name, nick_name, remark, create_time, update_time)
values
(1005, 'loginName005','nickName005', '每日新增数据', date_sub(now(), interval 1 day), date_sub(now(), interval 1 day));
update mall_user set nick_name='nickName033', remark ='每日修改数据' ,
create_time=date_sub(now(), interval 1 day), update_time=date_sub(now(), interval 1 day)
where uid=1003;
update mall_user set is_del =1, remark ='每日逻辑删除数据',
create_time=date_sub(now(), interval 1 day), update_time=date_sub(now(), interval 1 day)
where uid=1001;
3.1业务库表当前数据
3.2时间来到第三天,需要获取昨天变更的数据到当前ods表,1条新增+1条更新+1条逻辑删除
select * from mall_user
where (
create_time >= date_sub(curdate(), interval 1 day) and create_time < date_sub(curdate(), interval 0 day)
or
update_time >= date_sub(curdate(), interval 1 day) and update_time > date_sub(curdate(), interval 0 day)
);
变化的数据如下
将其抽到每日新增ods表(分区)
create table ods_mall_user_20220306
(
uid bigint unsigned auto_increment comment '用户唯一ID' primary key,
login_name varchar(32) not null comment '用户登录名',
nick_name varchar(32) not null default '请设置昵称' comment '用户昵称',
remark varchar(255) not null default '' comment '备注',
create_time timestamp default CURRENT_TIMESTAMP null comment '创建时间',
update_time timestamp default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP comment '更新数据',
is_del tinyint not null default '0' comment '删除标记 0-正常 1-已删除'
) comment '用户信息业务表-每日变更ods表' default character set utf8mb4;
replace into ods_mall_user_20220306
select * from mall_user
where (
create_time >= date_sub(curdate(), interval 1 day) and create_time < date_sub(curdate(), interval 0 day)
or
update_time >= date_sub(curdate(), interval 1 day) and update_time > date_sub(curdate(), interval 0 day)
);
4.合并全量最新数据到拉链表
昨日全量最新和今天变化的数据对比
select * from dim_mall_user_zip t1
select
uid,
login_name ,
nick_name ,
remark ,
create_time ,
update_time ,
is_del ,
curdate() start_date ,
'9999-12-31' end_date
from ods_mall_user_20220306
分析
不变的部分1004
修改的部分1001(标记删除),1003(昵称修改),原来的数据需要将结束时间改为当天,并插入新的1003数据
新增的部分1005
开链表上应该有1003、1004、1005记录
因为在mysql8上,不支持fulljoin,用union实现全外连接,左表还不能指明列名?!
select
*
from dim_mall_user_zip_bak zip right join
(select
uid new_uid,
login_name new_login_name,
nick_name new_nick_name,
remark new_remark,
create_time new_create_time,
update_time new_update_time,
is_del new_is_del,
curdate() new_start_date,
'9999-12-31' new_end_date
from ods_mall_user_20220306 where is_del=0) ods on zip.uid=ods.new_uid
union
select * from dim_mall_user_zip_bak zip left join
(select
uid,
login_name ,
nick_name ,
remark ,
create_time ,
update_time ,
is_del ,
curdate() start_date ,
'9999-12-31' end_date
from ods_mall_user_20220306 where is_del=0) ods on zip.uid=ods.uid
全外连接后和execl里面是一样的
1001 loginName001 nickName001 初始数据 2022-03-04 13:32:20.0 2022-03-04 13:32:20.0 0 2022-03-04 9999-12-31 1001 loginName001 nickName001 每日逻辑删除数据 2022-03-05 11:24:12.0 2022-03-05 11:24:12.0 1 2022-03-06 9999-12-31
1003 loginName003 nickName002 初始数据 2022-03-02 13:32:20.0 2022-03-02 13:32:20.0 0 2022-03-02 9999-12-31 1003 loginName003 nickName033 每日修改数据 2022-03-05 11:24:09.0 2022-03-05 11:24:09.0 0 2022-03-06 9999-12-31
1004 loginName004 nickName004 每日新增数据 2022-03-05 11:24:06.0 2022-03-05 11:24:06.0 0 2022-03-06 9999-12-31
1005 loginName005 nickName005 每日新增数据 2022-03-05 11:24:06.0 2022-03-05 11:24:06.0 0 2022-03-06 9999-12-31
4.1模拟hive覆盖全新数据的拉链表
create table dim_mall_user_zip_bak like dim_mall_user_zip;
insert into dim_mall_user_zip_bak select * from dim_mall_user_zip; -- 此时只有初始化有效的2条数据
truncate table dim_mall_user_zip;
mysql还得套一层,不然过滤会有问题
insert into dim_mall_user_zip
select * from (
select
if(new_uid is not null, new_uid, uid) uid,
if(new_login_name is not null, new_login_name, login_name) login_name,
if(new_nick_name is not null, new_nick_name, nick_name) nick_name,
if(new_remark is not null, new_remark, remark) remark,
if(new_create_time is not null, new_create_time, create_time) create_time,
if(new_update_time is not null, new_update_time, update_time) update_time,
if(new_is_del is not null, new_is_del, is_del) is_del,
if(new_start_date is not null, new_start_date, start_date) start_date,
if(new_end_date is not null, new_end_date, end_date) end_date
from (
select
*
from dim_mall_user_zip_bak zip right join
(select
uid new_uid,
login_name new_login_name,
nick_name new_nick_name,
remark new_remark,
create_time new_create_time,
update_time new_update_time,
is_del new_is_del,
curdate() new_start_date,
'9999-12-31' new_end_date
from ods_mall_user_20220306) ods on zip.uid=ods.new_uid
union
select * from dim_mall_user_zip_bak zip left join
(select
uid,
login_name ,
nick_name ,
remark ,
create_time ,
update_time ,
is_del ,
curdate() start_date ,
'9999-12-31' end_date
from ods_mall_user_20220306 ) ods on zip.uid=ods.uid
) all_new
)t
where is_del=0
4.2将过期的数据插入插入当天过期分区表
create table dim_mall_user_zip_expire_20220306 like dim_mall_user_zip;
insert into dim_mall_user_zip_expire_20220306
select
uid,
login_name,
nick_name,
remark,
create_time,
update_time,
is_del,
start_date,
curdate() end_date
from (
select
*
from dim_mall_user_zip_bak zip right join
(select
uid new_uid,
login_name new_login_name,
nick_name new_nick_name,
remark new_remark,
create_time new_create_time,
update_time new_update_time,
is_del new_is_del,
curdate() new_start_date,
'9999-12-31' new_end_date
from ods_mall_user_20220306) ods on zip.uid=ods.new_uid
union
select * from dim_mall_user_zip_bak zip left join
(select
uid new_uid,
login_name new_login_name,
nick_name new_nick_name,
remark new_remark,
create_time new_create_time,
update_time new_update_time,
is_del new_is_del,
curdate() new_start_date,
'9999-12-31' new_end_date
from ods_mall_user_20220306 ) ods on zip.uid=ods.new_uid
where uid is not null and new_uid is not null
) all_new
where uid is not null and new_uid is not null;
4.3查看开链表和两张闭链表
select * from dim_mall_user_zip dmuz
union all
select * from dim_mall_user_zip_expire_20220305 dmuze
union all
select * from dim_mall_user_zip_expire_20220306 dmuze2
order by uid, end_date
5.对于已过期的数据和同时标记了删除的数据,实际上历史保留的是哪一条?
replace dim_mall_user_zip_expire_20220306
select
uid,
login_name,
nick_name,
remark,
create_time,
update_time,
is_del,
start_date,
curdate() end_date
from (
select
*
from dim_mall_user_zip_bak zip right join
(select
uid new_uid,
login_name new_login_name,
nick_name new_nick_name,
remark new_remark,
create_time new_create_time,
update_time new_update_time,
is_del new_is_del,
curdate() new_start_date,
'9999-12-31' new_end_date
from ods_mall_user_20220306) ods on zip.uid=ods.new_uid
union
select * from dim_mall_user_zip_bak zip left join
(select
uid new_uid,
login_name new_login_name,
nick_name new_nick_name,
remark new_remark,
create_time new_create_time,
update_time new_update_time,
is_del new_is_del,
curdate() new_start_date,
'9999-12-31' new_end_date
from ods_mall_user_20220306 ) ods on zip.uid=ods.new_uid
where uid is not null and new_uid is not null
) all_new
where uid is not null and new_uid is not null
union
select uid, login_name, nick_name, remark, create_time, update_time, is_del,
date_format(create_time, '%Y-%m-%d') start_date,
date_format(update_time, '%Y-%m-%d') end_date
from mall_user
where is_del=1
AND
(
create_time >= date_sub(curdate(), interval 1 day) and create_time < date_sub(curdate(), interval 0 day)
or
update_time >= date_sub(curdate(), interval 1 day) and update_time > date_sub(curdate(), interval 0 day)
);