一、创建表并加载数据
1、创建商家维表
drop table if exists dim.dim_trade_shops_org;
create table dim.dim_trade_shops_org(
shopid int COMMENT '商家id',
shopName string COMMENT '商家名称',
cityId int COMMENT '城市id',
cityName string COMMENT '城市名称',
regionId int COMMENT '地域id',
regionName string COMMENT '地域名称')
COMMENT '商家维表'
partitioned by (dt string)
STORED AS PARQUET;
2、创建商家维表拉链表
--不是分区表
DROP TABLE IF EXISTS dim.dim_trade_shops_org_his;
CREATE TABLE dim.dim_trade_shops_org_his(
shopid int COMMENT '商家id',
shopName string COMMENT '商家名称',
cityId int COMMENT '城市id',
cityName string COMMENT '城市名称',
regionId int COMMENT '地域id',
regionName string COMMENT '地域名称'
start_date STRING,
end_date STRING)
COMMENT '用户信息拉链表'
row format delimited fields terminated by ',';
3、加载数据到商家维表
-- 动态分区数据加载:分区的值是不固定的,由输入数据确定
-- 创建中间表(非分区表)
drop table if exists test.tmp1;
create table test.tmp1 as
select * from dim.dim_trade_shops_org;
-- tmp1 非分区表,使用系统默认的字段分割符'\001'
alter table test.tmp1 set serdeproperties('field.delim'=',');
-- 向中间表加载数据
load data local inpath '/data/lagoudw/data/trade_shops.dat' into table test.tmp1;
-- 从中间表向分区表加载数据
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition = true;
insert into table dim.dim_trade_shops_org partition(dt)
select * from test.tmp1;
二、拉链表的实现
1、初始化拉链表(只需要执行一次)
insert overwrite table dim.dim_trade_shops_org_his
select shopid,
shopName,
cityId,
cityName,
regionId,
regionName,
dt as start_date,
"9999-12-31" as end_date
from dim.dim_trade_shops_org
2、构建拉链表(每天执行一次)
insert overwrite table dim.dim_trade_shops_org_his
--新增数据
select shopid,
shopName,
cityId,
cityName,
regionId,
regionName,
dt as start_date,
"9999-12-31" as end_date
from dim.dim_trade_shops_org
where dt = '2021-07-14'
--历史数据
union all
SELECT A.shopid,
A.shopName,
A.cityId,
A.cityName,
A.regionId,
A.regionName,
A.start_date,
CASE WHEN A.end_date = "9999-12-31" and B.shopid is not null
then date_add('2021-07-14',-1)
else A.end_date
end as end_date
FROM
(select * from dim.dim_trade_shops_org_his) A
left join
(select * from dim.dim_trade_shops_org) B
on A.shopid = B.shopid
3、处理拉链表的脚本
#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert overwrite table dim.dim_trade_shops_org_his
select shopid,
shopName,
cityId,
cityName,
regionId,
regionName,
dt as start_date,
'9999-12-31' as end_date
from dim.dim_trade_shops_org
where dt = '$do_date'
union all
SELECT A.shopid,
A.shopName,
A.cityId,
A.cityName,
A.regionId,
A.regionName,
A.start_date,
CASE WHEN A.end_date = '9999-12-31' and B.shopid is not null
then date_add('$do_date',-1)
else A.end_date
end as end_date
FROM
(select * from dim.dim_trade_shops_org_his) A
left join
(select * from dim.dim_trade_shops_org) B
on A.shopid = B.shopid
"
hive -e "$sql"
三、拉链表的回滚
- end_date < rollback_date,即结束日期 < 回滚日期,表示该行数据在rollback_date 之前产生,这些数据需要原样保留;
- start_date <= rollback_date <= end_date,即开始日期 <= 回滚日期 <= 结束日期,这些数据是回滚日期之后产生的,但是需要修改,将end_date 改为 9999-12-31;
- 其他数据不用管。
1、处理 end_date < rollback_date 的数据,保留
select *, '1' as tag
from dim.dim_trade_shops_org_his
where end_date < '2021-07-14';
2、处理 start_date <= rollback_date <= end_date 的数据,设置end_date=9999-12-31
select shopid,
shopName,
cityId,
cityName,
regionId,
regionName,
start_date,
'9999-12-31' as end_date,
'2' as tag
from dim.dim_trade_shops_org_his
where end_date >= '2021-07-14'
and start_date <= '2021-07-14'
3、将前面两步的数据写入临时表tmp(拉链表)
DROP TABLE IF EXISTS dim.dim_trade_shops_org_his_tmp;
create table dim.dim_trade_shops_org_his_tmp as
select *, '1' as tag
from dim.dim_trade_shops_org_his
where end_date < '2021-07-14'
union all
select shopid,
shopName,
cityId,
cityName,
regionId,
regionName,
start_date,
'9999-12-31' as end_date,
'2' as tag
from dim.dim_trade_shops_org_his
where end_date >= '2021-07-14'
and start_date <= '2021-07-14'
4、处理回滚拉链表的脚本
#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
DROP TABLE IF EXISTS dim.dim_trade_shops_org_his_tmp;
create table dim.dim_trade_shops_org_his_tmp as
select *, '1' as tag
from dim.dim_trade_shops_org_his
where end_date < '$do_date'
union all
select shopid,
shopName,
cityId,
cityName,
regionId,
regionName,
start_date,
'9999-12-31' as end_date,
'2' as tag
from dim.dim_trade_shops_org_his
where end_date >= '$do_date'
and start_date <= '$do_date'
"
hive -e "$sql"