–第一步:
数据库源表orders
create table orders(
order_id int,
createtime string,
modifiedtime string,
status string
)
20190821开始做拉链表 数据如下
±--------------------------------±----------------------------------±------------------------------------±------------------------------±-+
| orders.order_id | orders.createtime | orders.modifiedtime | orders.status |
±--------------------------------±----------------------------------±------------------------------------±------------------------------±-+
| 1 | 2019-08-18 | 2019-08-18 | 创建 |
| 2 | 2019-08-18 | 2019-08-18 | 创建 |
| 3 | 2019-08-19 | 2019-08-21 | 支付 |
| 4 | 2019-08-19 | 2019-08-21 | 完成 |
| 5 | 2019-08-19 | 2019-08-20 | 支付 |
| 6 | 2019-08-20 | 2019-08-20 | 创建 |
| 7 | 2019-08-20 | 2019-08-21 | 支付 |
| 8 | 2019-08-21 | 2019-08-21 | 创建 |
±--------------------------------±----------------------------------±------------------------------------±------------------------------±-+
– 创建 ods 层增量数据表,按天分区,存放每天的增量数据
create table ods_order_inc(
order_id int,
createtime string,
modifiedtime string,
status string
)
partitioned by (day string)
row format delimited fields terminated by ',' stored as TEXTFILE;
–进行全量的初始化(这个sql只执行一次)
INSERT overwrite TABLE ods_order_inc PARTITION (day ="20190821")
SELECT order_id,createtime,modifiedtime,status
FROM orders --注意:order表在生产环境中应该是数据库中的表,使用工具将数据抽取到ods_order_inc,这里为了简便直接使用了查询
WHERE createtime <= '20190821';
– dw 拉链表建表语句
create table dw_order_his(
order_id int,
createtime string,
modifiedtime string,
status string,
dw_start_date string,
dw_end_date string
)
row format delimited fields terminated by ',' stored as TEXTFILE;
--将数据加入到拉链表
INSERT overwrite TABLE dw_order_his
SELECT order_id,createtime,modifiedtime,status,
createtime AS dw_start_date,
'9999-12-31' AS dw_end_date
FROM ods_order_inc
WHERE day = '20190821';
–第二步:
–将2019-08-22好的数据增量同步到ods_order_inc分区表
INSERT overwrite TABLE ods_order_inc PARTITION (day = '20190822')
SELECT order_id,createtime,modifiedtime,status
FROM orders
WHERE createtime = '2019-08-22' OR modifiedtime = '2019-08-22';
–例如:数据如下
±----------±------------±--------------±--------±-+
| order_id | createtime | modifiedtime | status |
±----------±------------±--------------±--------±-+
| 1 | 2019-08-18 | 2019-08-22 | 支付 |
| 2 | 2019-08-18 | 2019-08-22 | 完成 |
| 6 | 2019-08-20 | 2019-08-22 | 支付 |
| 8 | 2019-08-21 | 2019-08-22 | 支付 |
| 9 | 2019-08-22 | 2019-08-22 | 创建 |
| 10 | 2019-08-22 | 2019-08-22 | 支付 |
±----------±------------±--------------±--------±-+
----第三步:
–创建dw_orders_his_tmp临时表
–更新sql分为两部分,
–1 第一部分 ,使用拉链表(里面是20190821以前的数据)left join ods_order_inc 20190822 号的数据 关联的上的说明,20190822 更新了20190821号的数据,将这些数据的结束时间改为20190821,数据追加到dw_orders_his_tmp
–2 第二部分,直接将ods_order_inc 中 20190822 中的数据追加到dw_orders_his_tmp,
DROP TABLE IF EXISTS dw_orders_his_tmp;
CREATE TABLE dw_order_his_tmp
AS
select * from (
SELECT
order_id order_id,
createtime,
modifiedtime,
status,
dw_start_date,
dw_end_date
FROM
(SELECT
a.order_id,
a.createtime,
a.modifiedtime,
a.status,
a.dw_start_date,
CASE WHEN b.order_id IS NOT NULL AND a.dw_end_date > '2019-08-22' THEN '2019-08-21' ELSE a.dw_end_date END AS dw_end_date
FROM dw_order_his a
left outer join (SELECT * FROM ods_order_inc WHERE day = '20190822') b
ON (a.order_id = b.order_id))
union all
SELECT
order_id order_id,
createtime,
modifiedtime,
status,
modifiedtime AS dw_start_date,
'9999-12-31' AS dw_end_date
FROM ods_order_inc
WHERE day = '20190822')t ORDER BY order_id,dw_start_date;
----第四步:
–更新拉链表
INSERT overwrite TABLE dw_order_his
SELECT * FROM dw_order_his_tmp;
结果如下
±-----------------------±-------------------------±---------------------------±---------------------±----------------------------±--------------------------±-+
| dw_order_his.order_id | dw_order_his.createtime | dw_order_his.modifiedtime | dw_order_his.status | dw_order_his.dw_start_date | dw_order_his.dw_end_date |
±-----------------------±-------------------------±---------------------------±---------------------±----------------------------±--------------------------±-+
| 1 | 2019-08-18 | 2019-08-18 | 创建 | 2019-08-18 | 2019-08-21 |
| 1 | 2019-08-18 | 2019-08-22 | 支付 | 2019-08-22 | 9999-12-31 |
| 2 | 2019-08-18 | 2019-08-18 | 创建 | 2019-08-18 | 2019-08-21 |
| 2 | 2019-08-18 | 2019-08-22 | 完成 | 2019-08-22 | 9999-12-31 |
| 3 | 2019-08-19 | 2019-08-21 | 支付 | 2019-08-19 | 9999-12-31 |
| 4 | 2019-08-19 | 2019-08-21 | 完成 | 2019-08-19 | 9999-12-31 |
| 5 | 2019-08-19 | 2019-08-20 | 支付 | 2019-08-19 | 9999-12-31 |
| 6 | 2019-08-20 | 2019-08-20 | 创建 | 2019-08-20 | 2019-08-21 |
| 6 | 2019-08-20 | 2019-08-22 | 支付 | 2019-08-22 | 9999-12-31 |
| 7 | 2019-08-20 | 2019-08-21 | 支付 | 2019-08-20 | 9999-12-31 |
| 8 | 2019-08-21 | 2019-08-21 | 创建 | 2019-08-21 | 2019-08-21 |
| 8 | 2019-08-21 | 2019-08-22 | 支付 | 2019-08-22 | 9999-12-31 |
| 9 | 2019-08-22 | 2019-08-22 | 创建 | 2019-08-22 | 9999-12-31 |
| 10 | 2019-08-22 | 2019-08-22 | 支付 | 2019-08-22 | 9999-12-31 |
±-----------------------±-------------------------±---------------------------±---------------------±----------------------------±--------------------------±-+
每天运行第二步到第四步即完成了拉链表的制作