1.在mysql创建表导入数据
-- MySQL source table. id/age are numeric columns, so the inserts below pass
-- numeric literals instead of quoted strings (the original relied on implicit
-- string->int coercion). The stored values are identical.
mysql>create table `student_test` (
`id` int (10),
`name` varchar (150),
`age` int (10),
`create_time` datetime ,
`lastmodify_time` datetime
);
mysql>
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(1,'wangwu',21,'2023-01-10 09:35:25','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(2,'zhangsan',21,'2023-01-10 09:35:26','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(3,'lisi',20,'2023-01-10 08:45:23','2023-01-11 13:11:59');
-- NOTE(review): row 4's lastmodify_time is date-only ('2023-01-10 '); MySQL
-- pads it to 2023-01-10 00:00:00 (visible in the Hive output further down).
-- Presumably intentional demo data -- confirm.
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(4,'xiaohong',30,'2023-01-10 09:35:25','2023-01-10 ');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(5,'feifei',22,'2023-01-10 09:35:25','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(6,'zhaoliu',22,'2023-01-10 09:35:25','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(7,'test',22,'2023-01-10 08:46:23','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(8,'aaa',22,'2023-01-10 08:46:23','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(9,'bbb',22,'2023-01-10 08:46:27','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(10,'ccc',23,'2023-01-10 08:48:27','2023-01-11 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(11,'ddd',24,'2023-01-10 08:49:28','2023-01-10 13:11:59');
insert into `student_test` (`id`, `name`, `age`, `create_time`, `lastmodify_time`) values(12,'eee',25,'2023-01-10 11:45:29','2023-01-10 14:11:59');
2.hive:创建ods层的原始数据增量导入表 (把操作系统数据几乎无处理地存放在数据仓库中(可以做一部分 clean 工作))
-- ODS incremental landing table: one dt partition per load day, holding the
-- comma-delimited rows Sqoop dropped on HDFS for that day. Timestamp columns
-- are kept as strings because Sqoop exports them as text (rendered with a
-- trailing ".0" in the result sets shown below).
hive>create table ods_student_test(
id int,
name string,
age int,
create_time string,
lastmodify_time string
)
partitioned by (dt string)
row format delimited
fields terminated by ',';
3.创建dwd层的全量表(DWD 和 DWS,主要采用维度模型方法建设,多采用维度退化的方法,即将维度退化到事实表,减少事实表和维表的关联,提高事实表的易用性;在汇总层,加强指标的维度退化,采用宽表化手段构建公共指标数据层,提升公共指标的复用性,减少重复加工。) dwd_student_test历史表
-- DWD full-snapshot table: each dt partition is rebuilt daily as the complete
-- deduplicated state of student_test (yesterday's snapshot merged with
-- today's ODS increment -- see the WITH ... INSERT OVERWRITE steps below).
-- Same column layout and delimiter as the ODS table.
hive>create table dwd_student_test(
id int,
name string,
age int,
create_time string,
lastmodify_time string
)
partitioned by (dt string)
row format delimited
fields terminated by ',';
4.sqoop连接mysql数据导出至hdfs上(10日)
---导入数据并修改元数据,数据和表关联
# Sqoop incremental import for 2023-01-10.
# NOTE(fix): the original placed "#//" comments after each trailing "\",
# which breaks the shell line continuation -- every option after the first
# commented line would execute as a separate command -- and it left a
# dangling "\" after "-m 1". Option notes moved up here instead:
#   --table                source MySQL table name
#   --target-dir           HDFS output dir (auto-created if missing)
#   --incremental lastmodified / --check-column
#                          take rows whose lastmodify_time is newer than
#                          --last-value ('2023-01-09 23:59:59' -> the 10th)
#   --where                upper bound: strictly before the 11th
#   --fields-terminated-by fields separated by ','
#   -m 1                   number of map tasks
linux>sqoop import \
--connect jdbc:mysql://mysql_IP:3306/testdb \
--username root \
--password 123 \
--table student_test \
--target-dir '/user/hive/warehouse/testdb/student_test/dt=2023-01-10' \
--incremental lastmodified \
--check-column lastmodify_time \
--last-value '2023-01-09 23:59:59' \
--where "lastmodify_time<'2023-01-11 00:00:00'" \
--fields-terminated-by ',' \
-m 1
5.hive加载hdfs上的数据并指定分区(10日)
hive>load data inpath'/user/hive/warehouse/testdb/student_test/dt=2023-01-10' INTO TABLE ods_student_test partition (dt='2023-01-10');
6.查看数据与分区
hive>show partitions ods_student_test; #//查看分区
hive>alter table ods_student_test drop if exists partition(dt='2023-01-10'); #//删除分区
hive>select * from ods_student_test;
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| ods_student_test.id | ods_student_test.name | ods_student_test.age | ods_student_test.create_time | ods_student_test.lastmodify_time | ods_student_test.dt |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 1 | wangwu | 21 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 2 | zhangsan | 21 | 2023-01-10 09:35:26.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 4 | xiaohong | 30 | 2023-01-10 09:35:25.0 | 2023-01-10 00:00:00.0 | 2023-01-10 |
| 5 | feifei | 22 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 6 | zhaoliu | 22 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 7 | test | 22 | 2023-01-10 08:46:23.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 8 | aaa | 22 | 2023-01-10 08:46:23.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 9 | bbb | 22 | 2023-01-10 08:46:27.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 11 | ddd | 24 | 2023-01-10 08:49:28.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 12 | eee | 25 | 2023-01-10 11:45:29.0 | 2023-01-10 14:11:59.0 | 2023-01-10 |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
10 rows selected (0.214 seconds)
7.更新历史表dwd_student_test(10日)
-- Rebuild the 2023-01-10 full snapshot:
--   temp   = today's increment (ODS dt=2023-01-10)
--            UNION ALL yesterday's full snapshot (DWD dt=2023-01-09)
--   result = rank each id's rows by lastmodify_time, newest first
-- Keeping rn=1 makes the latest version of every id win; INSERT OVERWRITE
-- makes the step idempotent on re-run.
hive>with temp as(
select
id,name,age,create_time,lastmodify_time
from ods_student_test where dt='2023-01-10'
union all
select
id,name,age,create_time,lastmodify_time
from dwd_student_test where dt='2023-01-09'
),
result as(
select
id,
name,
age,
create_time,
lastmodify_time,
-- NOTE(review): ties on lastmodify_time are broken arbitrarily by row_number
row_number() over(partition by id order by lastmodify_time desc) as rn
from temp
)
insert overwrite table dwd_student_test partition(dt='2023-01-10')
select
id,
name,
age,
create_time,
lastmodify_time
from result
where rn=1;
8.sqoop连接mysql数据导出至hdfs上(11日)
---导入数据并修改元数据,数据和表关联
# Sqoop incremental import for 2023-01-11.
# NOTE(fix): same repair as the 2023-01-10 command -- inline "#//" comments
# after the trailing "\" broke the line continuation, and "-m 1" carried a
# dangling "\". Option notes:
#   --table                source MySQL table name
#   --target-dir           HDFS output dir (auto-created if missing)
#   --incremental lastmodified / --check-column
#                          take rows whose lastmodify_time is newer than
#                          --last-value ('2023-01-10 23:59:59' -> the 11th)
#   --where                upper bound: strictly before the 12th
#   --fields-terminated-by fields separated by ','
#   -m 1                   number of map tasks
linux>sqoop import \
--connect jdbc:mysql://mysql_IP:3306/testdb \
--username root \
--password 123 \
--table student_test \
--target-dir '/user/hive/warehouse/testdb/student_test/dt=2023-01-11' \
--incremental lastmodified \
--check-column lastmodify_time \
--last-value '2023-01-10 23:59:59' \
--where "lastmodify_time<'2023-01-12 00:00:00'" \
--fields-terminated-by ',' \
-m 1
9.hive加载hdfs上的数据并指定分区(11日)
load data inpath'/user/hive/warehouse/testdb/student_test/dt=2023-01-11' INTO TABLE ods_student_test partition (dt='2023-01-11');
10.hive查看指定分区数据
-- NOTE(fix): the original dropped the WHERE keyword before the partition
-- filter, which is a syntax error in Hive; the 2-row result shown below
-- matches the filtered query.
hive> select * from ods_student_test where dt='2023-01-11';
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| ods_student_test.id | ods_student_test.name | ods_student_test.age | ods_student_test.create_time | ods_student_test.lastmodify_time | ods_student_test.dt |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 3 | lisi | 20 | 2023-01-10 08:45:23.0 | 2023-01-11 13:11:59.0 | 2023-01-11 |
| 10 | ccc | 23 | 2023-01-10 08:48:27.0 | 2023-01-11 13:11:59.0 | 2023-01-11 |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
hive>select * from ods_student_test;
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| ods_student_test.id | ods_student_test.name | ods_student_test.age | ods_student_test.create_time | ods_student_test.lastmodify_time | ods_student_test.dt |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 1 | wangwu | 21 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 2 | zhangsan | 21 | 2023-01-10 09:35:26.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 4 | xiaohong | 30 | 2023-01-10 09:35:25.0 | 2023-01-10 00:00:00.0 | 2023-01-10 |
| 5 | feifei | 22 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 6 | zhaoliu | 22 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 7 | test | 22 | 2023-01-10 08:46:23.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 8 | aaa | 22 | 2023-01-10 08:46:23.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 9 | bbb | 22 | 2023-01-10 08:46:27.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 11 | ddd | 24 | 2023-01-10 08:49:28.0 | 2023-01-10 13:11:59.0 | 2023-01-10 |
| 12 | eee | 25 | 2023-01-10 11:45:29.0 | 2023-01-10 14:11:59.0 | 2023-01-10 |
| 3 | lisi | 20 | 2023-01-10 08:45:23.0 | 2023-01-11 13:11:59.0 | 2023-01-11 |
| 10 | ccc | 23 | 2023-01-10 08:48:27.0 | 2023-01-11 13:11:59.0 | 2023-01-11 |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
12 rows selected (0.125 seconds)
11.更新历史表dwd_student_test(11日)
-- Rebuild the 2023-01-11 full snapshot (same pattern as the 10th):
--   temp   = today's increment (ODS dt=2023-01-11)
--            UNION ALL yesterday's full snapshot (DWD dt=2023-01-10)
--   result = rank each id's rows by lastmodify_time, newest first
-- rn=1 keeps the latest version of every id; INSERT OVERWRITE keeps the
-- step idempotent on re-run.
hive>with temp as(
select
id,name,age,create_time,lastmodify_time
from ods_student_test where dt='2023-01-11'
union all
select
id,name,age,create_time,lastmodify_time
from dwd_student_test where dt='2023-01-10'
),
result as(
select
id,
name,
age,
create_time,
lastmodify_time,
-- NOTE(review): ties on lastmodify_time are broken arbitrarily by row_number
row_number() over(partition by id order by lastmodify_time desc) as rn
from temp
)
insert overwrite table dwd_student_test partition(dt='2023-01-11')
select
id,
name,
age,
create_time,
lastmodify_time
from result
where rn=1;
12.查看历史表11日数据
hive>select * from dwd_student_test where dt='2023-01-11';
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| dwd_student_test.id | dwd_student_test.name | dwd_student_test.age | dwd_student_test.create_time | dwd_student_test.lastmodify_time | dwd_student_test.dt |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
| 1 | wangwu | 21 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 2 | zhangsan | 21 | 2023-01-10 09:35:26.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 3 | lisi | 20 | 2023-01-10 08:45:23.0 | 2023-01-11 13:11:59.0 | 2023-01-11 |
| 4 | xiaohong | 30 | 2023-01-10 09:35:25.0 | 2023-01-10 00:00:00.0 | 2023-01-11 |
| 5 | feifei | 22 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 6 | zhaoliu | 22 | 2023-01-10 09:35:25.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 7 | test | 22 | 2023-01-10 08:46:23.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 8 | aaa | 22 | 2023-01-10 08:46:23.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 9 | bbb | 22 | 2023-01-10 08:46:27.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 10 | ccc | 23 | 2023-01-10 08:48:27.0 | 2023-01-11 13:11:59.0 | 2023-01-11 |
| 11 | ddd | 24 | 2023-01-10 08:49:28.0 | 2023-01-10 13:11:59.0 | 2023-01-11 |
| 12 | eee | 25 | 2023-01-10 11:45:29.0 | 2023-01-10 14:11:59.0 | 2023-01-11 |
+----------------------+------------------------+-----------------------+-------------------------------+-----------------------------------+----------------------+--+
12 rows selected (0.079 seconds)
增量导入脚本
linux>vi increment.sh
#!/bin/bash
# Daily incremental import: pull one day's changed rows from MySQL into HDFS
# with Sqoop, then register the matching Hive partition.
# Usage: increment.sh [YYYY-MM-DD]   (defaults to yesterday)
export SQOOP_HOME=/opt/install/sqoop/
export HIVE_HOME=/opt/install/hive/
DT_INCR=`date -d'-1 day' +%Y-%m-%d`
# NOTE(fix): quote the test -- the original bare [ $1 ] misbehaves on
# empty or whitespace-containing arguments.
if [ -n "$1" ]
then
DT_INCR=$1
fi
# NOTE(fix): the original ended the command with "-m 2 \" followed by a "##"
# line, leaving the continuation dependent on the comment being swallowed;
# the command now terminates cleanly at "-m 2".
${SQOOP_HOME}/bin/sqoop import \
--connect jdbc:mysql://mysql_IP:3306/库名 \
--username root \
--password 123 \
--table 表名 \
--target-dir "/user/hive/warehouse/testdb/student_test/dt=${DT_INCR}" \
--incremental lastmodified \
--check-column lastmodify_time \
--last-value "${DT_INCR} 00:00:00" \
--where "date(lastmodify_time)='${DT_INCR}'" \
--fields-terminated-by ',' \
--split-by id \
-m 2
if [ $? -eq 0 ]
then
echo "import success!"
echo "准备加载到hive表分区:dt=${DT_INCR}"
# NOTE(review): "add partition" assumes test.student_incr's table location
# matches the --target-dir layout above -- confirm against the table DDL.
${HIVE_HOME}/bin/hive -e "alter table test.student_incr add partition(dt='${DT_INCR}')"
if [ $? -eq 0 ]
then
echo "load data success"
else
echo "fail"
fi
else
echo "data transform fail"
fi
增量合并脚本
linux>vi Incremental_merge.sh
#!/bin/bash
# Daily merge: combine the previous full snapshot with the day's increment
# and write the new snapshot partition (latest row per id wins).
# Usage: Incremental_merge.sh [SNAPSHOT_DT INCR_DT]
#   defaults: snapshot = day before yesterday, increment = yesterday
export SQOOP_HOME=/opt/install/sqoop/
export HIVE_HOME=/opt/install/hive/
DT_WHOLE=`date -d'-2 day' +%Y-%m-%d`
DT_INCR=`date -d'-1 day' +%Y-%m-%d`
if [[ -n $1 && -n $2 ]]
then
DT_WHOLE=$1
DT_INCR=$2
fi
##
${HIVE_HOME}/bin/hive -e "
with temp as(
select
id,name,age,create_time,lastmodify_time
from test.student_incr where dt='${DT_INCR}'
union all
select
id,name,age,create_time,lastmodify_time
from test.dwd_student where dt='${DT_WHOLE}'
),
result as(
select
id,
name,
age,
create_time,
lastmodify_time,
row_number() over(partition by id order by lastmodify_time desc) as rn
from temp
)
-- NOTE(fix): OVERWRITE (as in the manual steps earlier) instead of the
-- original INSERT INTO, so re-running the script does not duplicate rows
-- in the target partition
insert overwrite table test.dwd_student partition(dt='${DT_INCR}')
select
id,
name,
age,
create_time,
lastmodify_time
from result
where rn=1
"
if [ $? -eq 0 ]
then
echo "merge success"
else
echo "fail"
fi