Hive incremental import, updating the history table, and incremental import & merge scripts

1. Create the table in MySQL and insert sample data

mysql>create table `student_test` (
	`id` int (10),
	`name` varchar (150),
	`age` int (10),
	`create_time` datetime ,
	`lastmodify_time` datetime 
); 
mysql>
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('1','wangwu','21','2023-01-10 09:35:25','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('2','zhangsan','21','2023-01-10 09:35:26','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('3','lisi','20','2023-01-10 08:45:23','2023-01-11 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('4','xiaohong','30','2023-01-10 09:35:25','2023-01-10 ');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('5','feifei','22','2023-01-10 09:35:25','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('6','zhaoliu','22','2023-01-10 09:35:25','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('7','test','22','2023-01-10 08:46:23','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('8','aaa','22','2023-01-10 08:46:23','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('9','bbb','22','2023-01-10 08:46:27','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('10','ccc','23','2023-01-10 08:48:27','2023-01-11 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('11','ddd','24','2023-01-10 08:49:28','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values('12','eee','25','2023-01-10 11:45:29','2023-01-10 14:11:59');

2. Hive: create the ODS-layer table for the raw incremental imports (the ODS layer stores source-system data in the warehouse almost unprocessed; some light cleaning can be done here)

hive>create table ods_student_test(
id int,
name string,
age int,
create_time string,
lastmodify_time string
)
partitioned by (dt string)
row format delimited
fields terminated by ',';

3. Create the DWD-layer full history table dwd_student_test. (DWD and DWS are built mainly with dimensional modeling, usually degenerating dimensions into the fact table to reduce joins between the fact table and dimension tables and make the fact table easier to use; at the summary layer, dimension degeneration is taken further and wide tables are used to build a common-metrics layer, improving reuse of common metrics and reducing duplicated processing.)

hive>create table dwd_student_test(
id int,
name string,
age int,
create_time string,
lastmodify_time string
)
partitioned by (dt string)
row format delimited
fields terminated by ',';

4. Use Sqoop to import the MySQL data into HDFS (Jan 10)

--- Import the data; it will be associated with the Hive table partition in the next step.
linux>sqoop import \
--connect jdbc:mysql://mysql_IP:3306/testdb \
--username root \
--password 123 \
--table student_test \
--target-dir '/user/hive/warehouse/testdb/student_test/dt=2023-01-10' \
--incremental lastmodified \
--check-column lastmodify_time \
--last-value '2023-01-09 23:59:59' \
--where "lastmodify_time<'2023-01-11 00:00:00'" \
--fields-terminated-by ',' \
-m 1

Flag notes (inline comments cannot follow a line-continuation "\", so they are listed separately):
--table                  source MySQL table
--target-dir             HDFS output path; created automatically if it does not exist
--incremental lastmodified   incremental mode keyed on a last-modified timestamp ("lastmodified" is the mode, not a column name)
--check-column           the column Sqoop checks for new or updated rows
--last-value             lower bound; to import Jan 10 data, use Jan 9 23:59:59
--where                  upper bound; to import Jan 10 data, require values earlier than Jan 11 00:00:00
--fields-terminated-by   separate fields with ","
-m                       number of map tasks
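
Before loading into Hive, it can be worth confirming that the import actually produced files in the target directory (the path below is simply the --target-dir used above):

linux>hdfs dfs -ls /user/hive/warehouse/testdb/student_test/dt=2023-01-10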

5. Load the HDFS data into Hive and assign it to a partition (Jan 10)

hive>load data inpath '/user/hive/warehouse/testdb/student_test/dt=2023-01-10' into table ods_student_test partition (dt='2023-01-10');

6. View the data and partitions

hive>show partitions ods_student_test;    -- list partitions
hive>alter table ods_student_test drop if exists partition(dt='2023-01-10');    -- how to drop a partition, if needed
hive>select * from ods_student_test;
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| ods_student_test.id  | ods_student_test.name  | ods_student_test.age  | ods_student_test.create_time  | ods_student_test.lastmodify_time  | ods_student_test.dt  |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 1                    | wangwu                 | 21                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 2                    | zhangsan               | 21                    | 2023-01-10 09:35:26.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 4                    | xiaohong               | 30                    | 2023-01-10 09:35:25.0         | 2023-01-10 00:00:00.0             | 2023-01-10           |
| 5                    | feifei                 | 22                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 6                    | zhaoliu                | 22                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 7                    | test                   | 22                    | 2023-01-10 08:46:23.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 8                    | aaa                    | 22                    | 2023-01-10 08:46:23.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 9                    | bbb                    | 22                    | 2023-01-10 08:46:27.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 11                   | ddd                    | 24                    | 2023-01-10 08:49:28.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 12                   | eee                    | 25                    | 2023-01-10 11:45:29.0         | 2023-01-10 14:11:59.0             | 2023-01-10           |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
10 rows selected (0.214 seconds)

7. Update the history table dwd_student_test (Jan 10)

hive>with temp as(
select 
id,name,age,create_time,lastmodify_time
from ods_student_test where dt='2023-01-10'
union all
select 
id,name,age,create_time,lastmodify_time
from dwd_student_test where dt='2023-01-09'
),
result as(
select 
id,
name,
age,
create_time,
lastmodify_time,
row_number() over(partition by id order by lastmodify_time desc) as rn
from temp
)
insert overwrite table dwd_student_test partition(dt='2023-01-10')
select 
id,
name,
age,
create_time,
lastmodify_time
from result
where rn=1;
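
On this first run there is no dt='2023-01-09' partition in dwd_student_test, so the second branch of the union contributes nothing and the dt='2023-01-10' partition should simply receive the 10 ODS rows. A quick sanity check (output omitted here):

hive>select * from dwd_student_test where dt='2023-01-10';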

8. Use Sqoop to import the MySQL data into HDFS (Jan 11)

--- Import the data; it will be associated with the Hive table partition in the next step. The flags are the same as in step 4, only the dates and the target partition change.
linux>sqoop import \
--connect jdbc:mysql://mysql_IP:3306/testdb \
--username root \
--password 123 \
--table student_test \
--target-dir '/user/hive/warehouse/testdb/student_test/dt=2023-01-11' \
--incremental lastmodified \
--check-column lastmodify_time \
--last-value '2023-01-10 23:59:59' \
--where "lastmodify_time<'2023-01-12 00:00:00'" \
--fields-terminated-by ',' \
-m 1

9. Load the HDFS data into Hive and assign it to a partition (Jan 11)

hive>load data inpath '/user/hive/warehouse/testdb/student_test/dt=2023-01-11' into table ods_student_test partition (dt='2023-01-11');

10. Query the specified partition in Hive

hive> select * from ods_student_test where dt='2023-01-11';
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| ods_student_test.id  | ods_student_test.name  | ods_student_test.age  | ods_student_test.create_time  | ods_student_test.lastmodify_time  | ods_student_test.dt  |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 3                    | lisi                   | 20                    | 2023-01-10 08:45:23.0         | 2023-01-11 13:11:59.0             | 2023-01-11           |
| 10                   | ccc                    | 23                    | 2023-01-10 08:48:27.0         | 2023-01-11 13:11:59.0             | 2023-01-11           |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
hive>select * from ods_student_test;
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| ods_student_test.id  | ods_student_test.name  | ods_student_test.age  | ods_student_test.create_time  | ods_student_test.lastmodify_time  | ods_student_test.dt  |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 1                    | wangwu                 | 21                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 2                    | zhangsan               | 21                    | 2023-01-10 09:35:26.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 4                    | xiaohong               | 30                    | 2023-01-10 09:35:25.0         | 2023-01-10 00:00:00.0             | 2023-01-10           |
| 5                    | feifei                 | 22                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 6                    | zhaoliu                | 22                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 7                    | test                   | 22                    | 2023-01-10 08:46:23.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 8                    | aaa                    | 22                    | 2023-01-10 08:46:23.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 9                    | bbb                    | 22                    | 2023-01-10 08:46:27.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 11                   | ddd                    | 24                    | 2023-01-10 08:49:28.0         | 2023-01-10 13:11:59.0             | 2023-01-10           |
| 12                   | eee                    | 25                    | 2023-01-10 11:45:29.0         | 2023-01-10 14:11:59.0             | 2023-01-10           |
| 3                    | lisi                   | 20                    | 2023-01-10 08:45:23.0         | 2023-01-11 13:11:59.0             | 2023-01-11           |
| 10                   | ccc                    | 23                    | 2023-01-10 08:48:27.0         | 2023-01-11 13:11:59.0             | 2023-01-11           |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
12 rows selected (0.125 seconds)

11. Update the history table dwd_student_test (Jan 11)

hive>with temp as(
select 
id,name,age,create_time,lastmodify_time
from ods_student_test where dt='2023-01-11'
union all
select 
id,name,age,create_time,lastmodify_time
from dwd_student_test where dt='2023-01-10'
),
result as(
select 
id,
name,
age,
create_time,
lastmodify_time,
row_number() over(partition by id order by lastmodify_time desc) as rn
from temp
)
insert overwrite table dwd_student_test partition(dt='2023-01-11')
select 
id,
name,
age,
create_time,
lastmodify_time
from result
where rn=1;

12. View the Jan 11 data in the history table

hive>select * from dwd_student_test where dt='2023-01-11';
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| dwd_student_test.id  | dwd_student_test.name  | dwd_student_test.age  | dwd_student_test.create_time  | dwd_student_test.lastmodify_time  | dwd_student_test.dt  |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 1                    | wangwu                 | 21                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 2                    | zhangsan               | 21                    | 2023-01-10 09:35:26.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 3                    | lisi                   | 20                    | 2023-01-10 08:45:23.0         | 2023-01-11 13:11:59.0             | 2023-01-11           |
| 4                    | xiaohong               | 30                    | 2023-01-10 09:35:25.0         | 2023-01-10 00:00:00.0             | 2023-01-11           |
| 5                    | feifei                 | 22                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 6                    | zhaoliu                | 22                    | 2023-01-10 09:35:25.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 7                    | test                   | 22                    | 2023-01-10 08:46:23.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 8                    | aaa                    | 22                    | 2023-01-10 08:46:23.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 9                    | bbb                    | 22                    | 2023-01-10 08:46:27.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 10                   | ccc                    | 23                    | 2023-01-10 08:48:27.0         | 2023-01-11 13:11:59.0             | 2023-01-11           |
| 11                   | ddd                    | 24                    | 2023-01-10 08:49:28.0         | 2023-01-10 13:11:59.0             | 2023-01-11           |
| 12                   | eee                    | 25                    | 2023-01-10 11:45:29.0         | 2023-01-10 14:11:59.0             | 2023-01-11           |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
12 rows selected (0.079 seconds)

Incremental import script

linux>vi increment.sh
#!/bin/bash
export SQOOP_HOME=/opt/install/sqoop/
export HIVE_HOME=/opt/install/hive/


DT_INCR=`date -d'-1 day' +%Y-%m-%d`

if [ -n "$1" ]
then
DT_INCR=$1
fi

${SQOOP_HOME}/bin/sqoop import \
--connect jdbc:mysql://mysql_IP:3306/<database_name> \
--username root \
--password 123 \
--table <table_name> \
--target-dir "/user/hive/warehouse/testdb/student_test/dt=${DT_INCR}"  \
--incremental lastmodified \
--check-column lastmodify_time \
--last-value "${DT_INCR} 00:00:00"  \
--where "date(lastmodify_time)='${DT_INCR}'"   \
--fields-terminated-by ',' \
--split-by id  \
-m 2

##

if [ $? -eq 0 ]
then 
echo "import sueccess!"
echo "准备加载到hive表分区:dt=${DT_INCR}"
${HIVE_HOME}/bin/hive -e "alter table test.student_incr add partition(dt='${DT_INCR}')"


if [ $? -eq 0 ]
then 
echo "load data success"
else
echo "fail"
fi

else
echo "data transfomr fail"
fi
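
A quick usage sketch (assuming the placeholders above have been filled in and the script is executable): pass a date to import rows last modified on that day, or pass nothing to default to yesterday.

linux>sh increment.sh 2023-01-10    # import rows last modified on 2023-01-10
linux>sh increment.sh               # no argument: defaults to yesterday's changes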

Incremental merge script

linux>vi Incremental_merge.sh
#!/bin/bash

export SQOOP_HOME=/opt/install/sqoop/
export HIVE_HOME=/opt/install/hive/

DT_WHOLE=`date -d'-2 day' +%Y-%m-%d`
DT_INCR=`date -d'-1 day' +%Y-%m-%d`

if [[ $1 && $2 ]]
then
DT_WHOLE=$1
DT_INCR=$2
fi

##


${HIVE_HOME}/bin/hive -e "
with temp as(
select 
id,name,age,create_time,lastmodify_time
from test.student_incr where dt='${DT_INCR}'
union all
select 
id,name,age,create_time,lastmodify_time
from test.dwd_student where dt='${DT_WHOLE}'
),
result as(
select 
id,
name,
age,
create_time,
lastmodify_time,
row_number() over(partition by id order by lastmodify_time desc) as rn
from temp
)
insert overwrite table test.dwd_student partition(dt='${DT_INCR}')
select 
id,
name,
age,
create_time,
lastmodify_time
from result
where rn=1
"


if [ $? -eq 0 ]
then 
echo "meger success"
else
echo "fail"
fi
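
Usage sketch for the merge script: pass the existing full-history date first and the new increment date second, or pass nothing to merge yesterday's increment on top of the day before.

linux>sh Incremental_merge.sh 2023-01-10 2023-01-11    # build the dt=2023-01-11 history partition from dt=2023-01-10 plus the 2023-01-11 increment
linux>sh Incremental_merge.sh                          # no arguments: uses the last two days by default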
