拉链表思路
2号拉链
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3号增量
1,ab
3,cc
1:拉链表是将数据进一步简化,既节约数仓存储空间,又让数据更利于计算
计算逻辑:如 1 号用户的 ae 状态假设为加入购物车,那么当它加入购物车的时候就给出 2020-08-25 到 9999-12-31
那么如果它有下一次操作,就以当天操作为start_time 成2020-09-01 到9999-12-31 生成 aa为已支付状态 这属于历史的拉链表,
那么新增了一条,1ab,将今天2020-09-03作为start_time.end_time作为9999-12-31
-- Zipper (SCD-2) table walkthrough
-- 1. Staging table for each day's incremental data pulled from the source DB.
--    `if exists` makes the script re-runnable from scratch.
drop table if exists test.stu_newdata;
CREATE TABLE test.stu_newdata(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system (sqoop --check-column)
)
partitioned by (dt string)  -- dt = extraction date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 2. Full daily-snapshot table: each dt partition holds the complete table
--    state as of that day (built by merging the previous snapshot with the
--    day's increment).
drop table if exists test.stu;
CREATE TABLE test.stu(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system
)
partitioned by (dt string)  -- dt = snapshot date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 3. Zipper (SCD-2) table: one row per version of each id, with a validity
--    range [start_dt, end_dt]; end_dt = '9999-12-31' marks the open version.
drop table if exists test.stu_zip;
CREATE TABLE test.stu_zip(
id int,
name string,
age string,
update_time timestamp,  -- was `bigint`: inconsistent with stu/stu_newdata and
                        -- with the '2020-09-01 15:34:55.0' values inserted below
start_dt string,        -- first day this version is effective
end_dt string           -- last day this version is effective ('9999-12-31' = open)
)
partitioned by (dt string)  -- dt = day the zipper state was produced
row format delimited fields terminated by '\001'
stored as textfile
;
-- 操作手册
-- 操作初始日期:2020-09-03
# 1. Initial sqoop import (2020-09-03) of the business table.
#    The JDBC URL is single-quoted: unquoted '?' and '&' are special to the
#    shell (the original relied on a fragile '\&' escape).
#    --incremental lastmodified + --check-column update_time pulls rows whose
#    update_time is at or after --last-value.
bin/sqoop import \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-03' \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-01 00:00:00' \
--fields-terminated-by '\001' \
--as-textfile \
-m 1
# 2. Load this batch of increment files into the Hive increment table.
#    NOTE: `load data inpath` (no LOCAL) MOVES the HDFS files into the
#    partition directory; the source dir is empty afterwards.
load data inpath '/sqoopdata/incr/stu/2020-09-03' into table test.stu_newdata partition(dt='2020-09-03');
因为是直接从HDFS文件夹中提取数据,没有Schema信息,所以需要一个由sqoop codegen生成的record class(bean)jar包。
# 3.生成2020-09-03日的全量表
生成class
# (Reference example from another cluster -- now fully commented out: in the
#  original only the first line was commented, so the dangling continuation
#  lines below would have executed as stray commands.)
#bin/sqoop codegen \
#--connect jdbc:mysql://impala01:3306/sqooptest \
#--username root \
#--password ABC123abc.123 \
#--table stu \
#--bindir /opt/apps/code/stu \
#--class-name Stu \
#--fields-terminated-by ","
# Generate the record class (Stu.jar) that `sqoop merge` needs, since the
# HDFS text files carry no schema of their own.
# JDBC URL quoted for the same reason as the import above.
bin/sqoop codegen \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--bindir /opt/apps/code/stu \
--class-name Stu \
--fields-terminated-by '\001'
-- Seed the full-snapshot table's 2020-09-02 partition with one mock row
-- (columns: id, name, age, update_time = '2020-09-01 15:34:55.0').
-- The stray pasted timestamp line that followed this statement was removed:
-- it was not valid SQL and would abort a scripted run.
insert into table stu partition(dt='2020-09-02') select 0,'xx',10,'2020-09-01 15:34:55.0';
## Merge the 2020-09-03 increment with the 2020-09-02 full snapshot into the
## 2020-09-03 full snapshot directory.
## --merge-key id: rows from --new-data replace matching rows from --onto.
## --jar-file/--class-name: record class produced by `sqoop codegen` above,
## required because the flat files carry no schema.
## (No comments inside the command: a comment line would break the
##  backslash-continuation.)
bin/sqoop merge \
--new-data /user/hive/warehouse/test.db/stu_newdata/dt=2020-09-03 \
--onto /user/hive/warehouse/test.db/stu/dt=2020-09-02 \
--target-dir /user/hive/warehouse/test.db/stu/dt=2020-09-03 \
--jar-file /opt/apps/code/stu/Stu.jar \
--class-name Stu \
--merge-key id
-- Attach the merged directory as the 2020-09-03 partition of the full table.
-- The interactive `hive>` prompt was removed from the statement so it can be
-- executed from a script (hive -f / beeline) as well as the CLI.
alter table stu add partition(dt='2020-09-03') location '/user/hive/warehouse/test.db/stu/dt=2020-09-03';
# 4.根据2020-09-03的增量,和2020-09-02的拉链表,生成2020-09-03的拉链表
--假设 2020-09-02的拉链表如下:
0,'xx',10,'2020-09-01 15:34:55.0',2020-09-01,9999-12-31
-- 2020-09-03的增量数据:
| 1 | aa | 18 | 2020-09-01 15:34:55.0 | 2020-09-03 |
| 2 | bbb | 38 | 2020-09-03 13:35:08.0 | 2020-09-03 |
| 3 | cc | 24 | 2020-09-02 14:35:14.0 | 2020-09-03 |
| 4 | ddd | 29 | 2020-09-03 10:40:17.0 | 2020-09-03 |
| 5 | ee | 28 | 2020-09-02 14:40:29.0 | 2020-09-03 |
| 6 | gg | 22 | 2020-09-03 13:46:23.0 | 2020-09-03 |
| 7 | jj | 24 | 2020-09-03 16:46:36.0 | 2020-09-03 |
| 8 | 胡磊 | 33 | 2020-09-03 17:15:11.0 | 2020-09-03 |
+---------+-----------+----------+------------------------+-------------+
-- Build the 2020-09-03 zipper partition from the 2020-09-02 zipper state plus
-- the 2020-09-03 increment.  Fixes over the draft:
--   * partition filters moved from the LEFT JOIN's ON clause into CTE WHEREs
--     (a filter on the LEFT table inside ON does not restrict left-side rows);
--   * the "new versions" branch reads stu_newdata, not the not-yet-written
--     stu_zip dt='2020-09-03' partition;
--   * statements are properly ';'-terminated, and the useless ORDER BY on an
--     INSERT was dropped.
with zip as (
  -- yesterday's full zipper state
  select id, name, age, update_time, start_dt, end_dt, dt
  from stu_zip
  where dt = '2020-09-02'
),
newdata as (
  -- today's changed rows
  select id, name, age, update_time, dt
  from stu_newdata
  where dt = '2020-09-03'
)
insert into table test.stu_zip partition(dt='2020-09-03')
select
  a.id,
  a.name,
  a.age,
  a.update_time,
  a.start_dt,
  case
    when a.end_dt = '9999-12-31' and b.id is not null then a.dt  -- close the open version
    else a.end_dt
  end as end_dt
from zip a
left join newdata b on a.id = b.id
union all
select
  id,
  name,
  age,
  update_time,
  dt as start_dt,         -- new version effective from the load date
  '9999-12-31' as end_dt  -- open-ended
from newdata
;
-- Yesterday's partition is superseded by today's complete zipper state.
alter table stu_zip drop partition(dt='2020-09-02');
2号拉链
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3号增量
1,ab
3,cc
3号拉链:
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,2020-09-02
1,ab,2020-09-03,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3,cc,2020-09-03,9999-12-31
-- Demo table for the day-3 increment (loaded from /root/lalian2.txt).
-- The file holds (id,name) pairs only, so int_time/out_time come back NULL.
-- Drop is db-qualified and guarded so the script is re-runnable.
drop table if exists test.lalian2;
CREATE TABLE test.lalian2(
id int,
name string,
int_time string,  -- effective-from date
out_time string   -- effective-to date ('9999-12-31' = still open)
)
partitioned by (dt string)
row format delimited fields terminated by ','
;
load data local inpath '/root/lalian2.txt' into table test.lalian2 partition (dt='2020-09-03');
-- Demo zipper table, day-2 state (loaded from /root/lalian1.txt).
-- Drop is db-qualified and guarded so the script is re-runnable.
drop table if exists test.lalian;
CREATE TABLE test.lalian(
id int,
name string,
int_time string,  -- effective-from date
out_time string   -- effective-to date ('9999-12-31' = still open)
)
partitioned by (dt string)
row format delimited fields terminated by ','
;
load data local inpath '/root/lalian1.txt' into table test.lalian partition(dt='2020-09-02');
-- Exploratory query: see how yesterday's zipper rows pair with today's
-- increment (ids absent from lalian2 come back with NULL b.* columns).
-- Fixed: proper statement terminator; table/alias on one line each.
select
  a.*,
  b.*
from lalian a
left join lalian2 b
  on a.id = b.id
;
+-------+---------+-------------+-------------+-------------+-------+---------+-------------+-------------+-------------+
| a.id | a.name | a.int_time | a.out_time | a.dt | b.id | b.name | b.int_time | b.out_time | b.dt |
+-------+---------+-------------+-------------+-------------+-------+---------+-------------+-------------+-------------+
| 1 | ae | 2020-08-25 | 2020-08-31 | 2020-09-02 | 1 | ab | NULL | NULL | 2020-09-03 |
| 1 | aa | 2020-09-01 | 9999-12-31 | 2020-09-02 | 1 | ab | NULL | NULL | 2020-09-03 |
| 2 | bb | 2020-09-01 | 2020-09-01 | 2020-09-02 | NULL | NULL | NULL | NULL | NULL |
| 2 | ba | 2020-09-02 | 9999-12-31 | 2020-09-02 | NULL | NULL | NULL | NULL | NULL |
+-------+---------+-------------+-------------+-------------+-------+---------+-------------+-------------+-------------+
+-------+---------+-------------+-------------+
| a.id | a.name | a.int_time | out_time |
+-------+---------+-------------+-------------+
| 1 | ae | 2020-08-25 | 2020-08-31 |
| 1 | aa | 2020-09-01 | 2020-09-02 |
| 2 | bb | 2020-09-01 | 2020-09-01 |
| 2 | ba | 2020-09-02 | 9999-12-31 |
+-------+---------+-------------+-------------+
+---------+-----------+---------------+---------------+
| _u1.id | _u1.name | _u1.int_time | _u1.out_time |
+---------+-----------+---------------+---------------+
| 1 | ae | 2020-08-25 | 2020-08-31 |
| 1 | aa | 2020-09-01 | 2020-09-02 |
| 1 | ab | 2020-09-03 | 9999-12-31 |
| 2 | bb | 2020-09-01 | 2020-09-01 |
| 2       | ba        | 2020-09-02    | 9999-12-31    |
| 3 | cc | 2020-09-03 | 9999-12-31 |
+---------+-----------+---------------+---------------+
-- Draft corrected.  The original had two hard defects:
--   * it selected 4 columns (including stu.int_time, which does not exist --
--     stu has id/name/age/update_time) into the 6-column stu_zip table;
--   * it joined the full snapshot `stu` instead of yesterday's zipper state,
--     with partition filters in the LEFT JOIN's ON clause (which do not
--     restrict left-side rows).
-- The zipper update joins yesterday's ZIPPER partition to today's increment:
with zip as (
  select id, name, age, update_time, start_dt, end_dt, dt
  from stu_zip
  where dt = '2020-09-02'
),
newdata as (
  select id, name, age, update_time, dt
  from stu_newdata
  where dt = '2020-09-03'
)
insert into table test.stu_zip partition(dt='2020-09-03')
select
  a.id, a.name, a.age, a.update_time, a.start_dt,
  case when a.end_dt = '9999-12-31' and b.id is not null then a.dt
       else a.end_dt
  end as end_dt
from zip a
left join newdata b on a.id = b.id
union all
select
  id, name, age, update_time,
  dt as start_dt,
  '9999-12-31' as end_dt
from newdata
;
-- Duplicate draft of the zipper update, corrected the same way: partition
-- filters belong in WHERE (not in the LEFT JOIN's ON clause, where a filter
-- on the left table is ignored for left rows), the append branch must read
-- stu_newdata (stu_zip dt='2020-09-03' does not exist yet), and statements
-- need ';' terminators.
with zip as (
  select id, name, age, update_time, start_dt, end_dt, dt
  from stu_zip
  where dt = '2020-09-02'
),
newdata as (
  select id, name, age, update_time, dt
  from stu_newdata
  where dt = '2020-09-03'
)
insert into table test.stu_zip partition(dt='2020-09-03')
select
  a.id,
  a.name,
  a.age,
  a.update_time,
  a.start_dt,
  case
    when a.end_dt = '9999-12-31' and b.id is not null then a.dt
    else a.end_dt
  end as end_dt
from zip a
left join newdata b on a.id = b.id
union all
select
  id,
  name,
  age,
  update_time,
  dt as start_dt,
  '9999-12-31' as end_dt
from newdata
;
-- Retire yesterday's partition; today's partition carries the full state.
alter table stu_zip drop partition(dt='2020-09-02');
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,2020-09-02
1,ab,2020-09-03,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3,cc,2020-09-03,9999-12-31
-- Zipper (SCD-2) table walkthrough -- cleaned-up version
-- 1. Staging table for each day's incremental data.
drop table if exists test.stu_newdata;
CREATE TABLE test.stu_newdata(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system
)
partitioned by (dt string)  -- dt = extraction date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 2. Full daily-snapshot table: each dt partition is the complete table state
--    as of that day.
drop table if exists test.stu;
CREATE TABLE test.stu(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system
)
partitioned by (dt string)  -- dt = snapshot date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 3. Zipper table: one row per version of each id with its validity window.
drop table if exists test.stu_zip;
CREATE TABLE test.stu_zip(
id int,
name string,
age string,
update_time timestamp,
start_dt string, -- first day this version is effective
end_dt string    -- last day this version is effective ('9999-12-31' = open)
)
partitioned by (dt string)  -- dt = day the zipper state was produced
row format delimited fields terminated by '\001'
stored as textfile
;
-- 操作手册
-- 操作初始日期:2020-09-03
# 1. Initial sqoop import (2020-09-03) of the business table.
#    JDBC URL single-quoted: unquoted '?' and '&' are shell metacharacters.
bin/sqoop import \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-03' \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-01 00:00:00' \
--fields-terminated-by '\001' \
--as-textfile \
-m 1
# 2. Load this batch of increment files into the Hive increment table
#    (`load data inpath` MOVES the HDFS files into the partition directory).
load data inpath '/sqoopdata/incr/stu/2020-09-03' into table test.stu_newdata partition(dt='2020-09-03');
# 3. Build the 2020-09-03 full snapshot.
#    First generate the record class (Stu.jar) that `sqoop merge` requires,
#    since the flat HDFS files carry no schema.  JDBC URL quoted as above.
bin/sqoop codegen \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--bindir /opt/apps/code/stu \
--class-name Stu \
--fields-terminated-by '\001'
## Seed the full-snapshot table's 2020-09-02 partition with one mock row
## (columns: id, name, age, update_time).
insert into table stu partition(dt='2020-09-02')
select 0,'xx',10,'2020-09-01 15:34:55.0'
;
## Merge the 2020-09-03 increment with the 2020-09-02 full snapshot into the
## 2020-09-03 full-snapshot directory.
## --merge-key id: rows from --new-data replace matching rows from --onto.
## --jar-file/--class-name: record class produced by `sqoop codegen` above.
bin/sqoop merge \
--new-data /user/hive/warehouse/test.db/stu_newdata/dt=2020-09-03 \
--onto /user/hive/warehouse/test.db/stu/dt=2020-09-02 \
--target-dir /user/hive/warehouse/test.db/stu/dt=2020-09-03 \
--jar-file /opt/apps/code/stu/Stu.jar \
--class-name Stu \
--merge-key id
-- Attach the merged directory as the 2020-09-03 partition of the full table.
-- The interactive `hive>` prompt was removed so this runs from a script too.
alter table stu add partition(dt='2020-09-03') location '/user/hive/warehouse/test.db/stu/dt=2020-09-03';
# 4.根据2020-09-03的增量,和2020-09-02的拉链表,生成2020-09-03的拉链表
-- Assume the 2020-09-02 zipper partition looks like this (mock seed row;
-- columns: id, name, age, update_time, start_dt, end_dt):
insert into table test.stu_zip partition(dt='2020-09-02')
select
0,'xx',10,'2020-09-01 15:34:55.0','2020-09-01','9999-12-31'
;
-- 2020-09-03的增量数据:
| 1 | aa | 18 | 2020-09-01 15:34:55.0 | 2020-09-03 |
| 2 | bbb | 38 | 2020-09-03 13:35:08.0 | 2020-09-03 |
| 3 | cc | 24 | 2020-09-02 14:35:14.0 | 2020-09-03 |
| 4 | ddd | 29 | 2020-09-03 10:40:17.0 | 2020-09-03 |
| 5 | ee | 28 | 2020-09-02 14:40:29.0 | 2020-09-03 |
| 6 | gg | 22 | 2020-09-03 13:46:23.0 | 2020-09-03 |
| 7 | jj | 24 | 2020-09-03 16:46:36.0 | 2020-09-03 |
| 8 | 胡磊 | 33 | 2020-09-03 17:15:11.0 | 2020-09-03 |
+---------+-----------+----------+------------------------+-------------+
-- 2020-09-03 zipper refresh: carry yesterday's rows forward -- closing any
-- open row whose id re-appears in today's increment -- then append the
-- increment rows as new open versions.
with prev_zip as (
    -- zipper state as of 2020-09-02
    select id, name, age, update_time, start_dt, end_dt, dt
    from stu_zip
    where dt = '2020-09-02'
),
incr as (
    -- rows changed on 2020-09-03
    select id, name, age, update_time, dt
    from stu_newdata
    where dt = '2020-09-03'
)
insert into table test.stu_zip partition (dt='2020-09-03')
select
    p.id,
    p.name,
    p.age,
    p.update_time,
    p.start_dt,
    case
        when p.end_dt = '9999-12-31' and i.id is not null then p.dt
        else p.end_dt
    end as end_dt
from prev_zip p
left join incr i on p.id = i.id
union all
select
    id,
    name,
    age,
    update_time,
    cast(to_date(update_time) as string) as start_dt,
    '9999-12-31' as end_dt
from incr
;
-- 验证准确性
1. 在mysql中的stu表中,再更新几条2020-09-04的数据
+----------------------------------+
| 0刘爽552020-09-04 15:34:55.0 |
| 7叶成岭222020-09-04 16:46:36.0 |
| 9闫丽站182020-09-04 09:34:30.0 |
| 10庞艳勋222020-09-04 09:34:58.0 |
+----------------------------------+
2. 用sqoop导入2020-09-04的增量数据
# Import the 2020-09-04 increment (rows with update_time >= --last-value).
# JDBC URL single-quoted: unquoted '?' and '&' are shell metacharacters.
bin/sqoop import \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-04' \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-04 00:00:00' \
--fields-terminated-by '\001' \
--as-textfile \
-m 1
3. 将增量数据文件,load入hive中的stu增量数据表
load data inpath '/sqoopdata/incr/stu/2020-09-04' into table test.stu_newdata partition(dt='2020-09-04');
4. 更新2020-09-04的拉链表
-- 2020-09-04 zipper refresh: close the open rows whose ids re-appear in the
-- 2020-09-04 increment, then append the increment rows as new open versions.
with prev_zip as (
    -- zipper state as of 2020-09-03
    select id, name, age, update_time, start_dt, end_dt, dt
    from stu_zip
    where dt = '2020-09-03'
),
incr as (
    -- rows changed on 2020-09-04
    select id, name, age, update_time, dt
    from stu_newdata
    where dt = '2020-09-04'
)
insert into table test.stu_zip partition (dt='2020-09-04')
select
    p.id,
    p.name,
    p.age,
    p.update_time,
    p.start_dt,
    case
        when p.end_dt = '9999-12-31' and i.id is not null then p.dt
        else p.end_dt
    end as end_dt
from prev_zip p
left join incr i on p.id = i.id
union all
select
    id,
    name,
    age,
    update_time,
    cast(to_date(update_time) as string) as start_dt,
    '9999-12-31' as end_dt
from incr
;