拉链表思路
2号拉链
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3号增量
1,ab
3,cc
1:拉链表是将数据进一步简化,既节约数仓存储空间,又让数据更利于计算
计算逻辑:如 1 号用户的 ae 状态假设为加入购物车,那么当它加入购物车的时候就给出 2020-08-25 到 9999-12-31
那么如果它有下一次操作,就以当天操作为start_time 成2020-09-01 到9999-12-31 生成 aa为已支付状态 这属于历史的拉链表,
那么新增了一条,1ab,将今天2020-09-03作为start_time.end_time作为9999-12-31
-- Zipper (SCD-2) table walkthrough
-- 1. Staging table for each day's incremental data pulled from the source DB.
--    `if exists` makes the script re-runnable from scratch.
drop table if exists test.stu_newdata;
CREATE TABLE test.stu_newdata(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system (sqoop --check-column)
)
partitioned by (dt string)  -- dt = extraction date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 2. Full daily-snapshot table: each dt partition holds the complete table
--    state as of that day (built by merging the previous snapshot with the
--    day's increment).
drop table if exists test.stu;
CREATE TABLE test.stu(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system
)
partitioned by (dt string)  -- dt = snapshot date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 3. Zipper (SCD-2) table: one row per version of each id, with a validity
--    range [start_dt, end_dt]; end_dt = '9999-12-31' marks the open version.
drop table if exists test.stu_zip;
CREATE TABLE test.stu_zip(
id int,
name string,
age string,
update_time timestamp,  -- was `bigint`: inconsistent with stu/stu_newdata and
                        -- with the '2020-09-01 15:34:55.0' values inserted below
start_dt string,        -- first day this version is effective
end_dt string           -- last day this version is effective ('9999-12-31' = open)
)
partitioned by (dt string)  -- dt = day the zipper state was produced
row format delimited fields terminated by '\001'
stored as textfile
;
-- 操作手册
-- 操作初始日期:2020-09-03
# 1. Initial sqoop import (2020-09-03) of the business table.
#    The JDBC URL is single-quoted: unquoted '?' and '&' are special to the
#    shell (the original relied on a fragile '\&' escape).
#    --incremental lastmodified + --check-column update_time pulls rows whose
#    update_time is at or after --last-value.
bin/sqoop import \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-03' \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-01 00:00:00' \
--fields-terminated-by '\001' \
--as-textfile \
-m 1
# 2. Load this batch of increment files into the Hive increment table.
#    NOTE: `load data inpath` (no LOCAL) MOVES the HDFS files into the
#    partition directory; the source dir is empty afterwards.
load data inpath '/sqoopdata/incr/stu/2020-09-03' into table test.stu_newdata partition(dt='2020-09-03');
因为是直接从HDFS文件夹中提取数据,没有Schema信息,所以需要一个由sqoop codegen生成的record class(bean)jar包。
# 3.生成2020-09-03日的全量表
生成class
# (Reference example from another cluster -- now fully commented out: in the
#  original only the first line was commented, so the dangling continuation
#  lines below would have executed as stray commands.)
#bin/sqoop codegen \
#--connect jdbc:mysql://impala01:3306/sqooptest \
#--username root \
#--password ABC123abc.123 \
#--table stu \
#--bindir /opt/apps/code/stu \
#--class-name Stu \
#--fields-terminated-by ","
# Generate the record class (Stu.jar) that `sqoop merge` needs, since the
# HDFS text files carry no schema of their own.
# JDBC URL quoted for the same reason as the import above.
bin/sqoop codegen \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--bindir /opt/apps/code/stu \
--class-name Stu \
--fields-terminated-by '\001'
-- Seed the full-snapshot table's 2020-09-02 partition with one mock row
-- (columns: id, name, age, update_time = '2020-09-01 15:34:55.0').
-- The stray pasted timestamp line that followed this statement was removed:
-- it was not valid SQL and would abort a scripted run.
insert into table stu partition(dt='2020-09-02') select 0,'xx',10,'2020-09-01 15:34:55.0';
## Merge the 2020-09-03 increment with the 2020-09-02 full snapshot into the
## 2020-09-03 full snapshot directory.
## --merge-key id: rows from --new-data replace matching rows from --onto.
## --jar-file/--class-name: record class produced by `sqoop codegen` above,
## required because the flat files carry no schema.
## (No comments inside the command: a comment line would break the
##  backslash-continuation.)
bin/sqoop merge \
--new-data /user/hive/warehouse/test.db/stu_newdata/dt=2020-09-03 \
--onto /user/hive/warehouse/test.db/stu/dt=2020-09-02 \
--target-dir /user/hive/warehouse/test.db/stu/dt=2020-09-03 \
--jar-file /opt/apps/code/stu/Stu.jar \
--class-name Stu \
--merge-key id
-- Attach the merged directory as the 2020-09-03 partition of the full table.
-- The interactive `hive>` prompt was removed from the statement so it can be
-- executed from a script (hive -f / beeline) as well as the CLI.
alter table stu add partition(dt='2020-09-03') location '/user/hive/warehouse/test.db/stu/dt=2020-09-03';
# 4.根据2020-09-03的增量,和2020-09-02的拉链表,生成2020-09-03的拉链表
--假设 2020-09-02的拉链表如下:
0,'xx',10,'2020-09-01 15:34:55.0',2020-09-01,9999-12-31
-- 2020-09-03的增量数据:
| 1 | aa | 18 | 2020-09-01 15:34:55.0 | 2020-09-03 |
| 2 | bbb | 38 | 2020-09-03 13:35:08.0 | 2020-09-03 |
| 3 | cc | 24 | 2020-09-02 14:35:14.0 | 2020-09-03 |
| 4 | ddd | 29 | 2020-09-03 10:40:17.0 | 2020-09-03 |
| 5 | ee | 28 | 2020-09-02 14:40:29.0 | 2020-09-03 |
| 6 | gg | 22 | 2020-09-03 13:46:23.0 | 2020-09-03 |
| 7 | jj | 24 | 2020-09-03 16:46:36.0 | 2020-09-03 |
| 8 | 胡磊 | 33 | 2020-09-03 17:15:11.0 | 2020-09-03 |
+---------+-----------+----------+------------------------+-------------+
-- Build the 2020-09-03 zipper partition from the 2020-09-02 zipper state plus
-- the 2020-09-03 increment.  Fixes over the draft:
--   * partition filters moved from the LEFT JOIN's ON clause into CTE WHEREs
--     (a filter on the LEFT table inside ON does not restrict left-side rows);
--   * the "new versions" branch reads stu_newdata, not the not-yet-written
--     stu_zip dt='2020-09-03' partition;
--   * statements are properly ';'-terminated, and the useless ORDER BY on an
--     INSERT was dropped.
with zip as (
  -- yesterday's full zipper state
  select id, name, age, update_time, start_dt, end_dt, dt
  from stu_zip
  where dt = '2020-09-02'
),
newdata as (
  -- today's changed rows
  select id, name, age, update_time, dt
  from stu_newdata
  where dt = '2020-09-03'
)
insert into table test.stu_zip partition(dt='2020-09-03')
select
  a.id,
  a.name,
  a.age,
  a.update_time,
  a.start_dt,
  case
    when a.end_dt = '9999-12-31' and b.id is not null then a.dt  -- close the open version
    else a.end_dt
  end as end_dt
from zip a
left join newdata b on a.id = b.id
union all
select
  id,
  name,
  age,
  update_time,
  dt as start_dt,         -- new version effective from the load date
  '9999-12-31' as end_dt  -- open-ended
from newdata
;
-- Yesterday's partition is superseded by today's complete zipper state.
alter table stu_zip drop partition(dt='2020-09-02');
2号拉链
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3号增量
1,ab
3,cc
3号拉链:
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,2020-09-02
1,ab,2020-09-03,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3,cc,2020-09-03,9999-12-31
-- Demo table for the day-3 increment (loaded from /root/lalian2.txt).
-- The file holds (id,name) pairs only, so int_time/out_time come back NULL.
-- Drop is db-qualified and guarded so the script is re-runnable.
drop table if exists test.lalian2;
CREATE TABLE test.lalian2(
id int,
name string,
int_time string,  -- effective-from date
out_time string   -- effective-to date ('9999-12-31' = still open)
)
partitioned by (dt string)
row format delimited fields terminated by ','
;
load data local inpath '/root/lalian2.txt' into table test.lalian2 partition (dt='2020-09-03');
-- Demo zipper table, day-2 state (loaded from /root/lalian1.txt).
-- Drop is db-qualified and guarded so the script is re-runnable.
drop table if exists test.lalian;
CREATE TABLE test.lalian(
id int,
name string,
int_time string,  -- effective-from date
out_time string   -- effective-to date ('9999-12-31' = still open)
)
partitioned by (dt string)
row format delimited fields terminated by ','
;
load data local inpath '/root/lalian1.txt' into table test.lalian partition(dt='2020-09-02');
-- Exploratory query: see how yesterday's zipper rows pair with today's
-- increment (ids absent from lalian2 come back with NULL b.* columns).
-- Fixed: proper statement terminator; table/alias on one line each.
select
  a.*,
  b.*
from lalian a
left join lalian2 b
  on a.id = b.id
;
+-------+---------+-------------+-------------+-------------+-------+---------+-------------+-------------+-------------+
| a.id | a.name | a.int_time | a.out_time | a.dt | b.id | b.name | b.int_time | b.out_time | b.dt |
+-------+---------+-------------+-------------+-------------+-------+---------+-------------+-------------+-------------+
| 1 | ae | 2020-08-25 | 2020-08-31 | 2020-09-02 | 1 | ab | NULL | NULL | 2020-09-03 |
| 1 | aa | 2020-09-01 | 9999-12-31 | 2020-09-02 | 1 | ab | NULL | NULL | 2020-09-03 |
| 2 | bb | 2020-09-01 | 2020-09-01 | 2020-09-02 | NULL | NULL | NULL | NULL | NULL |
| 2 | ba | 2020-09-02 | 9999-12-31 | 2020-09-02 | NULL | NULL | NULL | NULL | NULL |
+-------+---------+-------------+-------------+-------------+-------+---------+-------------+-------------+-------------+
+-------+---------+-------------+-------------+
| a.id | a.name | a.int_time | out_time |
+-------+---------+-------------+-------------+
| 1 | ae | 2020-08-25 | 2020-08-31 |
| 1 | aa | 2020-09-01 | 2020-09-02 |
| 2 | bb | 2020-09-01 | 2020-09-01 |
| 2 | ba | 2020-09-02 | 9999-12-31 |
+-------+---------+-------------+-------------+
+---------+-----------+---------------+---------------+
| _u1.id | _u1.name | _u1.int_time | _u1.out_time |
+---------+-----------+---------------+---------------+
| 1 | ae | 2020-08-25 | 2020-08-31 |
| 1 | aa | 2020-09-01 | 2020-09-02 |
| 1 | ab | 2020-09-03 | 9999-12-31 |
| 2 | bb | 2020-09-01 | 2020-09-01 |
| 2       | ba        | 2020-09-02    | 9999-12-31    |
| 3 | cc | 2020-09-03 | 9999-12-31 |
+---------+-----------+---------------+---------------+
-- Draft corrected.  The original had two hard defects:
--   * it selected 4 columns (including stu.int_time, which does not exist --
--     stu has id/name/age/update_time) into the 6-column stu_zip table;
--   * it joined the full snapshot `stu` instead of yesterday's zipper state,
--     with partition filters in the LEFT JOIN's ON clause (which do not
--     restrict left-side rows).
-- The zipper update joins yesterday's ZIPPER partition to today's increment:
with zip as (
  select id, name, age, update_time, start_dt, end_dt, dt
  from stu_zip
  where dt = '2020-09-02'
),
newdata as (
  select id, name, age, update_time, dt
  from stu_newdata
  where dt = '2020-09-03'
)
insert into table test.stu_zip partition(dt='2020-09-03')
select
  a.id, a.name, a.age, a.update_time, a.start_dt,
  case when a.end_dt = '9999-12-31' and b.id is not null then a.dt
       else a.end_dt
  end as end_dt
from zip a
left join newdata b on a.id = b.id
union all
select
  id, name, age, update_time,
  dt as start_dt,
  '9999-12-31' as end_dt
from newdata
;
-- Duplicate draft of the zipper update, corrected the same way: partition
-- filters belong in WHERE (not in the LEFT JOIN's ON clause, where a filter
-- on the left table is ignored for left rows), the append branch must read
-- stu_newdata (stu_zip dt='2020-09-03' does not exist yet), and statements
-- need ';' terminators.
with zip as (
  select id, name, age, update_time, start_dt, end_dt, dt
  from stu_zip
  where dt = '2020-09-02'
),
newdata as (
  select id, name, age, update_time, dt
  from stu_newdata
  where dt = '2020-09-03'
)
insert into table test.stu_zip partition(dt='2020-09-03')
select
  a.id,
  a.name,
  a.age,
  a.update_time,
  a.start_dt,
  case
    when a.end_dt = '9999-12-31' and b.id is not null then a.dt
    else a.end_dt
  end as end_dt
from zip a
left join newdata b on a.id = b.id
union all
select
  id,
  name,
  age,
  update_time,
  dt as start_dt,
  '9999-12-31' as end_dt
from newdata
;
-- Retire yesterday's partition; today's partition carries the full state.
alter table stu_zip drop partition(dt='2020-09-02');
1,ae,2020-08-25,2020-08-31
1,aa,2020-09-01,2020-09-02
1,ab,2020-09-03,9999-12-31
2,bb,2020-09-01,2020-09-01
2,ba,2020-09-02,9999-12-31
3,cc,2020-09-03,9999-12-31
-- Zipper (SCD-2) table walkthrough -- cleaned-up version
-- 1. Staging table for each day's incremental data.
drop table if exists test.stu_newdata;
CREATE TABLE test.stu_newdata(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system
)
partitioned by (dt string)  -- dt = extraction date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 2. Full daily-snapshot table: each dt partition is the complete table state
--    as of that day.
drop table if exists test.stu;
CREATE TABLE test.stu(
id int,
name string,
age string,
update_time timestamp  -- last-modified time in the source system
)
partitioned by (dt string)  -- dt = snapshot date
row format delimited fields terminated by '\001'
stored as textfile
;
-- 3. Zipper table: one row per version of each id with its validity window.
drop table if exists test.stu_zip;
CREATE TABLE test.stu_zip(
id int,
name string,
age string,
update_time timestamp,
start_dt string, -- first day this version is effective
end_dt string    -- last day this version is effective ('9999-12-31' = open)
)
partitioned by (dt string)  -- dt = day the zipper state was produced
row format delimited fields terminated by '\001'
stored as textfile
;
-- 操作手册
-- 操作初始日期:2020-09-03
# 1. Initial sqoop import (2020-09-03) of the business table.
#    JDBC URL single-quoted: unquoted '?' and '&' are shell metacharacters.
bin/sqoop import \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-03' \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-01 00:00:00' \
--fields-terminated-by '\001' \
--as-textfile \
-m 1
# 2. Load this batch of increment files into the Hive increment table
#    (`load data inpath` MOVES the HDFS files into the partition directory).
load data inpath '/sqoopdata/incr/stu/2020-09-03' into table test.stu_newdata partition(dt='2020-09-03');
# 3. Build the 2020-09-03 full snapshot.
#    First generate the record class (Stu.jar) that `sqoop merge` requires,
#    since the flat HDFS files carry no schema.  JDBC URL quoted as above.
bin/sqoop codegen \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--bindir /opt/apps/code/stu \
--class-name Stu \
--fields-terminated-by '\001'
## Seed the full-snapshot table's 2020-09-02 partition with one mock row
## (columns: id, name, age, update_time).
insert into table stu partition(dt='2020-09-02')
select 0,'xx',10,'2020-09-01 15:34:55.0'
;
## Merge the 2020-09-03 increment with the 2020-09-02 full snapshot into the
## 2020-09-03 full-snapshot directory.
## --merge-key id: rows from --new-data replace matching rows from --onto.
## --jar-file/--class-name: record class produced by `sqoop codegen` above.
bin/sqoop merge \
--new-data /user/hive/warehouse/test.db/stu_newdata/dt=2020-09-03 \
--onto /user/hive/warehouse/test.db/stu/dt=2020-09-02 \
--target-dir /user/hive/warehouse/test.db/stu/dt=2020-09-03 \
--jar-file /opt/apps/code/stu/Stu.jar \
--class-name Stu \
--merge-key id
-- Attach the merged directory as the 2020-09-03 partition of the full table.
-- The interactive `hive>` prompt was removed so this runs from a script too.
alter table stu add partition(dt='2020-09-03') location '/user/hive/warehouse/test.db/stu/dt=2020-09-03';
# 4.根据2020-09-03的增量,和2020-09-02的拉链表,生成2020-09-03的拉链表
-- Assume the 2020-09-02 zipper partition looks like this (mock seed row;
-- columns: id, name, age, update_time, start_dt, end_dt):
insert into table test.stu_zip partition(dt='2020-09-02')
select
0,'xx',10,'2020-09-01 15:34:55.0','2020-09-01','9999-12-31'
;
-- 2020-09-03的增量数据:
| 1 | aa | 18 | 2020-09-01 15:34:55.0 | 2020-09-03 |
| 2 | bbb | 38 | 2020-09-03 13:35:08.0 | 2020-09-03 |
| 3 | cc | 24 | 2020-09-02 14:35:14.0 | 2020-09-03 |
| 4 | ddd | 29 | 2020-09-03 10:40:17.0 | 2020-09-03 |
| 5 | ee | 28 | 2020-09-02 14:40:29.0 | 2020-09-03 |
| 6 | gg | 22 | 2020-09-03 13:46:23.0 | 2020-09-03 |
| 7 | jj | 24 | 2020-09-03 16:46:36.0 | 2020-09-03 |
| 8 | 胡磊 | 33 | 2020-09-03 17:15:11.0 | 2020-09-03 |
+---------+-----------+----------+------------------------+-------------+
-- 2020-09-03 zipper refresh: carry yesterday's rows forward -- closing any
-- open row whose id re-appears in today's increment -- then append the
-- increment rows as new open versions.
with prev_zip as (
    -- zipper state as of 2020-09-02
    select id, name, age, update_time, start_dt, end_dt, dt
    from stu_zip
    where dt = '2020-09-02'
),
incr as (
    -- rows changed on 2020-09-03
    select id, name, age, update_time, dt
    from stu_newdata
    where dt = '2020-09-03'
)
insert into table test.stu_zip partition (dt='2020-09-03')
select
    p.id,
    p.name,
    p.age,
    p.update_time,
    p.start_dt,
    case
        when p.end_dt = '9999-12-31' and i.id is not null then p.dt
        else p.end_dt
    end as end_dt
from prev_zip p
left join incr i on p.id = i.id
union all
select
    id,
    name,
    age,
    update_time,
    cast(to_date(update_time) as string) as start_dt,
    '9999-12-31' as end_dt
from incr
;
-- 验证准确性
1. 在mysql中的stu表中,再更新几条2020-09-04的数据
+----------------------------------+
| 0刘爽552020-09-04 15:34:55.0 |
| 7叶成岭222020-09-04 16:46:36.0 |
| 9闫丽站182020-09-04 09:34:30.0 |
| 10庞艳勋222020-09-04 09:34:58.0 |
+----------------------------------+
2. 用sqoop导入2020-09-04的增量数据
# Import the 2020-09-04 increment (rows with update_time >= --last-value).
# JDBC URL single-quoted: unquoted '?' and '&' are shell metacharacters.
bin/sqoop import \
--connect 'jdbc:mysql://doitedu01:3306/realtimedw?characterEncoding=utf8&useUnicode=true' \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-04' \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-04 00:00:00' \
--fields-terminated-by '\001' \
--as-textfile \
-m 1
3. 将增量数据文件,load入hive中的stu增量数据表
load data inpath '/sqoopdata/incr/stu/2020-09-04' into table test.stu_newdata partition(dt='2020-09-04');
4. 更新2020-09-04的拉链表
-- 2020-09-04 zipper refresh: close the open rows whose ids re-appear in the
-- 2020-09-04 increment, then append the increment rows as new open versions.
with prev_zip as (
    -- zipper state as of 2020-09-03
    select id, name, age, update_time, start_dt, end_dt, dt
    from stu_zip
    where dt = '2020-09-03'
),
incr as (
    -- rows changed on 2020-09-04
    select id, name, age, update_time, dt
    from stu_newdata
    where dt = '2020-09-04'
)
insert into table test.stu_zip partition (dt='2020-09-04')
select
    p.id,
    p.name,
    p.age,
    p.update_time,
    p.start_dt,
    case
        when p.end_dt = '9999-12-31' and i.id is not null then p.dt
        else p.end_dt
    end as end_dt
from prev_zip p
left join incr i on p.id = i.id
union all
select
    id,
    name,
    age,
    update_time,
    cast(to_date(update_time) as string) as start_dt,
    '9999-12-31' as end_dt
from incr
;