目录
前言
现有一张订单表,每天抽取源数据中最新的数据先放入stg临时存储层中,表名为stg_order_tb,字段有:order_id(订单ID),create_date(订单创建时间),modify_date(订单修改时间),status(订单状态),订单状态有:创建、支付、完成、取消四个状态。
在ods层中也有一张订单表,按dp(active、expire)和end_date分区,active记录是当前有效的数据,expire记录的是无效的数据。比如某订单在某一天先创建、再支付、最后完成,那么当天的数据中该订单就会有三条记录,其中创建和支付的记录会放入expire分区中,完成状态的记录会放入active分区中。
因为stg层是全量抽取源数据的,所以每天导入ods层怎么判断stg层order表中的数据是更新或新增的呢?(此方案有待改进,后期再改进优化)
创建模拟数据库和表
首先我们先模拟创建两张表
create database stg;
create database ods;
create table stg.stg_order_tb (
order_id varchar(3),
create_date varchar(10),
modify_date varchar(10),
status string
)
row format delimited fields terminated by '\t';
create table ods.ods_order_tb (
order_id varchar(3),
create_date varchar(10),
modify_date varchar(10),
status string,
start_date varchar(10),
change_code string
)
partitioned by(dp string, end_date varchar(10))
row format delimited fields terminated by '\t';
第一种情况:stg层中的order表是增量抽取最新数据(插入方式是先truncate,再增量插入最新数据)
注意:第一次跑成功后不支持再跑
插入第一批模拟数据
在stg层中的order表中插入模拟数据
insert into stg.stg_order_tb values
("001", "2019-02-01", "2019-02-01", "订单创建"),
("002", "2019-02-01", "2019-02-01", "订单创建"),
("003", "2019-02-02", "2019-02-02", "订单创建"),
("004", "2019-02-02", "2019-02-02", "订单创建"),
("005", "2019-02-02", "2019-02-02", "订单创建"),
("006", "2019-02-02", "2019-02-02", "订单创建");
执行拉链操作脚本
在ods表没有数据的情况下,执行以下脚本,将最新记录的数据放入active分区中,将历史数据放入expire分区中(此时并没有历史数据)
#!/bin/bash
yesterday=`date -d last-day +%F`
/opt/modules/hive-3.1.1/bin/hive -e "
--set hive.mapred.mode=nonstrict;
--set hive.strict.checks.cartesian.product=false;
from (select * from ods.ods_order_tb where dp='active') old
full join
(select
order_id,
create_date,
modify_date,
status,
md5(
concat(
coalesce(order_id,'NULL'),
coalesce(modify_date,'NULL'),
coalesce(status,'NULL')
)
) as change_code
from stg.stg_order_tb ) new
on old.order_id=new.order_id
insert overwrite table ods.ods_order_tb partition (dp='active', end_date='9999-12-31')
select
new.order_id,
new.create_date,
new.modify_date,
new.status,
(case when old.start_date is null then cast(date_sub(current_date,1) as varchar(10)) else old.start_date end ) as start_date,
new.change_code
where (old.order_id is null and new.order_id is not null) --如果ods层的order表没数据但stg层中有,将stg层的数据插入ods中
or (old.order_id is not null and new.order_id is not null) --如果ods层和stg层都有数据,将stg层的数据插入ods中
insert into table ods.ods_order_tb partition (dp='active', end_date='9999-12-31')
select
old.order_id,
old.create_date,
old.modify_date,
old.status,
(case when old.start_date is null then cast(date_sub(current_date,1) as varchar(10)) else old.start_date end ) as start_date,
old.change_code
where old.order_id is not null and new.order_id is null --如果ods层有数据,但stg层没有数据,将ods层的数据留下来
insert overwrite table ods.ods_order_tb partition (dp='expire', end_date='${yesterday}')
select
old.order_id,
old.create_date,
old.modify_date,
old.status,
old.start_date,
old.change_code
--如果ods和stg层都有数据,但change_code不一样,那将ods中active分区的数据插入expire分区中
where (old.order_id is not null and new.order_id is not null and old.change_code <> new.change_code);
"
查看ods层的order表情况
select * from ods.ods_order_tb where dp='active';
OK
order_id create_date modify_date status start_date change_code dp end_date
001 2019-02-01 2019-02-01 订单创建 2019-02-23 2db049a78acd53d290c3694a46201d2a active 9999-12-31
002 2019-02-01 2019-02-01 订单创建 2019-02-23 8e809cf9c14995f194e353ae7fcdb971 active 9999-12-31
003 2019-02-02 2019-02-02 订单创建 2019-02-23 c7f68148b9acb96cedf00ea28fdfd5d2 active 9999-12-31
004 2019-02-02 2019-02-02 订单创建 2019-02-23 e40bc9130ca7dd263ab2eb36ebcd8fc4 active 9999-12-31
005 2019-02-02 2019-02-02 订单创建 2019-02-23 4ddf8c77e0b5fa917fd36d295c0e1489 active 9999-12-31
006 2019-02-02 2019-02-02 订单创建 2019-02-23 c0db242a0ecb78b711e0939d00c9e2ff active 9999-12-31
Time taken: 0.961 seconds, Fetched: 6 row(s)
select * from ods.ods_order_tb where dp='expire';
OK
order_id create_date modify_date status start_date change_code dp end_date
Time taken: 0.27 seconds
发现stg层的order表数据已经插入了ods层order表的avtive分区中,但expire分区没有数据
更新stg层order表数据
truncate table stg.stg_order_tb;
insert into stg.stg_order_tb values
("001", "2019-02-01", "2019-02-03", "订单支付"),
("002", "2019-02-01", "2019-02-03", "订单支付"),
("003", "2019-02-02", "2019-02-03", "订单支付"),
("007", "2019-02-03", "2019-02-03", "订单创建"),
("008", "2019-02-03", "2019-02-03", "订单创建"),
("009", "2019-02-03", "2019-02-03", "订单创建");
select * from stg.stg_order_tb;
OK
order_id create_date modify_date status
001 2019-02-01 2019-02-03 订单支付
002 2019-02-01 2019-02-03 订单支付
003 2019-02-02 2019-02-03 订单支付
007 2019-02-03 2019-02-03 订单创建
008 2019-02-03 2019-02-03 订单创建
009 2019-02-03 2019-02-03 订单创建
Time taken: 0.268 seconds, Fetched: 6 row(s)
再次执行脚本后查看ods层order表数据情况
select * from ods.ods_order_tb where dp='active';
OK
order_id create_date modify_date status start_date change_code dp end_date
001 2019-02-01 2019-02-03 订单支付 2019-02-23 afd73500929403c17bc3078a4467c266 active 9999-12-31
002 2019-02-01 2019-02-03 订单支付 2019-02-23 7110b776a00c26919ae3e763ea87dcbd active 9999-12-31
003 2019-02-02 2019-02-03 订单支付 2019-02-23 ba24f500616a5feb4934622d575771b6 active 9999-12-31
007 2019-02-03 2019-02-03 订单创建 2019-02-23 ba0ba16c25d930b38ba540dbbc43793e active 9999-12-31
008 2019-02-03 2019-02-03 订单创建 2019-02-23 c47b3a308c928067566a915a70c4c14e active 9999-12-31
009 2019-02-03 2019-02-03 订单创建 2019-02-23 0063c07b4f40a561da6c356408446667 active 9999-12-31
004 2019-02-02 2019-02-02 订单创建 2019-02-23 e40bc9130ca7dd263ab2eb36ebcd8fc4 active 9999-12-31
005 2019-02-02 2019-02-02 订单创建 2019-02-23 4ddf8c77e0b5fa917fd36d295c0e1489 active 9999-12-31
006 2019-02-02 2019-02-02 订单创建 2019-02-23 c0db242a0ecb78b711e0939d00c9e2ff active 9999-12-31
Time taken: 4.323 seconds, Fetched: 9 row(s)
select * from ods.ods_order_tb where dp='expire';
OK
order_id create_date modify_date status start_date change_code dp end_date
001 2019-02-01 2019-02-01 订单创建 2019-02-23 2db049a78acd53d290c3694a46201d2a expire 2019-02-23
002 2019-02-01 2019-02-01 订单创建 2019-02-23 8e809cf9c14995f194e353ae7fcdb971 expire 2019-02-23
003 2019-02-02 2019-02-02 订单创建 2019-02-23 c7f68148b9acb96cedf00ea28fdfd5d2 expire 2019-02-23
Time taken: 0.36 seconds, Fetched: 3 row(s)
在active分区中的1/2/3订单状态变更了支付,4/5/6订单状态还是创建,新增了7/8/9订单的创建记录
在expire分区中新增了1/2/3订单创建状态的记录
第二种情况:stg层的数据是全量抽取最新数据(先truncate表,再全量插入最新数据)
注意:第一次跑成功后不支持再跑
清空stg和ods层order表的数据再插入模拟数据
truncate table stg.stg_order_tb;
truncate table ods.ods_order_tb;
insert into stg.stg_order_tb values
("001", "2019-02-01", "2019-02-01", "订单创建"),
("002", "2019-02-01", "2019-02-01", "订单创建"),
("003", "2019-02-02", "2019-02-02", "订单创建"),
("004", "2019-02-02", "2019-02-02", "订单创建"),
("005", "2019-02-02", "2019-02-02", "订单创建"),
("006", "2019-02-02", "2019-02-02", "订单创建");
执行脚本
#!/bin/bash
yesterday=`date -d last-day +%F`
/opt/modules/hive-3.1.1/bin/hive -e "
--set hive.mapred.mode=nonstrict;
--set hive.strict.checks.cartesian.product=false;
from (select * from ods.ods_order_tb where dp ='active') old
full join
(select
order_id,
create_date,
modify_date,
status,
md5(
concat(
coalesce(order_id,'NULL'),
coalesce(modify_date,'NULL'),
coalesce(status,'NULL')
)
) as change_code
from stg.stg_order_tb ) new
on old.order_id=new.order_id
insert overwrite table ods.ods_order_tb partition (dp='active', end_date='9999-12-31')
select
new.order_id,
new.create_date,
new.modify_date,
new.status,
(case when old.start_date is null then cast(date_sub(current_date,1) as varchar(10)) else old.start_date end ) as start_date,
new.change_code
where (old.order_id is null and new.order_id is not null) --如果ods层的order表没数据但stg层中有,将stg层的数据插入ods中
or (old.order_id is not null and new.order_id is not null) --如果ods层和stg层都有数据,将stg层的数据插入ods中
insert overwrite table ods.ods_order_tb partition (dp='expire', end_date='${yesterday}')
select
old.order_id,
old.create_date,
old.modify_date,
old.status,
old.start_date,
old.change_code
--如果ods和stg层都有数据,但change_code不一样,那将ods中active分区的数据插入expire分区中
where (old.order_id is not null and new.order_id is not null and old.change_code <> new.change_code);
"
查看ods层的数据情况
select * from ods.ods_order_tb where dp='active';
OK
order_id create_date modify_date status start_date change_code dp end_date
001 2019-02-01 2019-02-01 订单创建 2019-02-23 2db049a78acd53d290c3694a46201d2a active 9999-12-31
002 2019-02-01 2019-02-01 订单创建 2019-02-23 8e809cf9c14995f194e353ae7fcdb971 active 9999-12-31
003 2019-02-02 2019-02-02 订单创建 2019-02-23 c7f68148b9acb96cedf00ea28fdfd5d2 active 9999-12-31
004 2019-02-02 2019-02-02 订单创建 2019-02-23 e40bc9130ca7dd263ab2eb36ebcd8fc4 active 9999-12-31
005 2019-02-02 2019-02-02 订单创建 2019-02-23 4ddf8c77e0b5fa917fd36d295c0e1489 active 9999-12-31
006 2019-02-02 2019-02-02 订单创建 2019-02-23 c0db242a0ecb78b711e0939d00c9e2ff active 9999-12-31
Time taken: 0.642 seconds, Fetched: 6 row(s)
select * from ods.ods_order_tb where dp='expire';
OK
order_id create_date modify_date status start_date change_code dp end_date
Time taken: 0.228 seconds
清空stg表中数据并插入全量数据
truncate table stg.stg_order_tb;
insert into stg.stg_order_tb values
("001", "2019-02-01", "2019-02-03", "订单支付"),
("002", "2019-02-01", "2019-02-03", "订单支付"),
("003", "2019-02-02", "2019-02-03", "订单支付"),
("004", "2019-02-02", "2019-02-02", "订单创建"),
("005", "2019-02-02", "2019-02-02", "订单创建"),
("006", "2019-02-02", "2019-02-02", "订单创建"),
("007", "2019-02-03", "2019-02-03", "订单创建"),
("008", "2019-02-03", "2019-02-03", "订单创建"),
("009", "2019-02-03", "2019-02-03", "订单创建");
再次执行脚本并查看数据
select * from ods.ods_order_tb where dp='active';
OK
order_id create_date modify_date status start_date change_code dp end_date
001 2019-02-01 2019-02-03 订单支付 2019-02-23 afd73500929403c17bc3078a4467c266 active 9999-12-31
002 2019-02-01 2019-02-03 订单支付 2019-02-23 7110b776a00c26919ae3e763ea87dcbd active 9999-12-31
003 2019-02-02 2019-02-03 订单支付 2019-02-23 ba24f500616a5feb4934622d575771b6 active 9999-12-31
004 2019-02-02 2019-02-02 订单创建 2019-02-23 e40bc9130ca7dd263ab2eb36ebcd8fc4 active 9999-12-31
005 2019-02-02 2019-02-02 订单创建 2019-02-23 4ddf8c77e0b5fa917fd36d295c0e1489 active 9999-12-31
006 2019-02-02 2019-02-02 订单创建 2019-02-23 c0db242a0ecb78b711e0939d00c9e2ff active 9999-12-31
007 2019-02-03 2019-02-03 订单创建 2019-02-23 ba0ba16c25d930b38ba540dbbc43793e active 9999-12-31
008 2019-02-03 2019-02-03 订单创建 2019-02-23 c47b3a308c928067566a915a70c4c14e active 9999-12-31
009 2019-02-03 2019-02-03 订单创建 2019-02-23 0063c07b4f40a561da6c356408446667 active 9999-12-31
Time taken: 2.618 seconds, Fetched: 9 row(s)
select * from ods.ods_order_tb where dp='expire';
OK
order_id create_date modify_date status start_date change_code dp end_date
001 2019-02-01 2019-02-01 订单创建 2019-02-23 2db049a78acd53d290c3694a46201d2a expire 2019-02-23
002 2019-02-01 2019-02-01 订单创建 2019-02-23 8e809cf9c14995f194e353ae7fcdb971 expire 2019-02-23
003 2019-02-02 2019-02-02 订单创建 2019-02-23 c7f68148b9acb96cedf00ea28fdfd5d2 expire 2019-02-23
Time taken: 0.259 seconds, Fetched: 3 row(s)
在active分区中的1/2/3订单状态变更了支付,4/5/6订单状态还是创建,新增了7/8/9订单的创建记录
在expire分区中新增了1/2/3订单创建状态的记录