1.基本概念
1.增量表:记录更新周期内新增的数据,即在原表中数据的基础上新增本周期内产生的新数据;
2.全量表:记录更新周期内的全量数据,无论数据是否有变化都需要记录;
3.数仓一般分4层:ods层 -> 明细层 -> 聚合层 -> 应用层
2.需求及设计
对订单表进行数仓模型设计
ods层 -> 明细层 dl -> 聚合层 al -> 应用层 app
order1 全量导入 order2_d1_df;聚合层 order3_count_reduce;应用层 order_count_show
增量导入 order2_d1_di
3.执行步骤
1.准备mysql表order
字段:orderid userid goodname num money datatime
2.创建hive表order1 ods层
3.mysql导入hive表order1
4.创建hive表 全量数据:order2_d1_df 增量数据:order2_d1_di 明细层
5.order1全量导入order2_d1_df 增量导入order2_d1_di
6.创建hive表 order3_count_reduce 聚合层
7.查询明细层 人在货品上消费额 插入 order3_count_reduce
8.创建hive表 order_count_show 应用层
9.查询品类是B的 order3_count_reduce 人在货品上消费额 插入 order_count_show
10.在mysql中创建 order_count_show_mysql
11.将order_count_show数据导入mysql表order_count_show_mysql
4.详情
1.准备mysql表order
字段:orderid userid goodname num money datatime
-- Source table in MySQL (feeds the Hive ODS layer).
-- `order` is a MySQL reserved word, so the identifier MUST be backtick-quoted
-- (the original unquoted form is a syntax error).
CREATE TABLE `order`
(
    orderid  varchar(100)  NOT NULL,       -- order id (primary key)
    userid   varchar(100)  DEFAULT NULL,   -- user id
    aa       varchar(100)  DEFAULT NULL,   -- category code (queried as '品类' later)
    goodname varchar(100)  DEFAULT NULL,   -- product name
    num      bigint        DEFAULT NULL,   -- quantity purchased
    money    decimal(20,2) DEFAULT NULL,   -- amount: DECIMAL, never DOUBLE, for money
    datatime datetime      DEFAULT NULL,   -- order time
    PRIMARY KEY (orderid)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
测试数据
-- Test fixture: 9 orders across 3 days, 2 users, 2 products.
-- Collapsed to one multi-row INSERT with an explicit column list.
-- The original used curly quotes (‘…’), which MySQL rejects, and left the
-- reserved word `order` unquoted.
INSERT INTO test.`order`
    (orderid, userid, aa, goodname, num, money, datatime)
VALUES
    ('0001', 'w1', '1', 'good1', 2, 10, '2022-09-03 17:09:50'),
    ('0002', 'w2', '2', 'good1', 3, 20, '2022-09-03 17:09:50'),
    ('0003', 'w1', '3', 'good1', 1, 30, '2022-09-03 17:09:50'),
    ('0004', 'w2', '4', 'good1', 2, 40, '2022-09-02 17:09:50'),
    ('0005', 'w1', '5', 'good1', 2, 10, '2022-09-02 17:09:50'),
    ('0006', 'w2', '6', 'good2', 3, 20, '2022-09-02 17:09:50'),
    ('0007', 'w1', '7', 'good2', 1, 30, '2022-09-01 17:09:50'),
    ('0008', 'w2', '8', 'good2', 2, 50, '2022-09-01 17:09:50'),
    ('0009', 'w1', '9', 'good2', 3, 20, '2022-09-01 17:09:50');
2.创建hive表order1 ods层
-- ODS layer table in Hive: raw copy of the MySQL `order` table,
-- partitioned by load date. (Curly quotes around the partition comment
-- literal are replaced with straight quotes — Hive rejects the former.)
CREATE TABLE order1
(
    orderid  string,
    userid   string,
    aa       string,      -- category code
    goodname string,
    num      bigint,
    money    double,
    datatime timestamp
)
partitioned by (dt string comment '日期分区');
3.mysql导入hive表order1
# Sqoop full import: MySQL test.order -> Hive hivetest.order1 (ODS layer).
# Long options take a double dash "--"; the original text had en-dashes (–),
# which sqoop does not recognize. Curly quotes replaced with straight quotes.
sqoop import \
  --connect jdbc:mysql://localhost:3306/test \
  --username root \
  --password 123456 \
  --table order \
  -m 1 \
  --hive-import \
  --hive-table hivetest.order1 \
  --hive-partition-key 'dt' \
  --hive-partition-value '202209041721'
# First run only, to let sqoop create the table: add --create-hive-table
# In a scheduled job the partition value would be yesterday, e.g. date_sub(current_date,1).
# If the job fails midway, remove the stale staging dir before retrying:
#   hdfs dfs -rm -r /user/wanghao/order
4.创建hive表 全量数据:order2_d1_df 增量数据:order2_d1_di 明细层
-- DWD layer: daily full snapshot of all orders, one partition per day.
CREATE TABLE order2_d1_df (
    orderid  string,
    userid   string,
    aa       string,
    goodname string,
    num      bigint,
    money    double,
    datatime timestamp
) partitioned by (dt string);
-- DWD layer: daily incremental table (only the rows created that day),
-- same schema as the snapshot table, one partition per day.
CREATE TABLE order2_d1_di (
    orderid  string,
    userid   string,
    aa       string,
    goodname string,
    num      bigint,
    money    double,
    datatime timestamp
) partitioned by (dt string);
5.order1全量导入order2_d1_df 增量导入order2_d1_di
hive
全
-- Full load into the snapshot table: everything before today goes into
-- yesterday's partition via dynamic partitioning.
set hive.exec.dynamic.partition=true;
-- The value must be "nonstrict" — the original "nostrick" is invalid and
-- leaves the mode at "strict", failing the dynamic-partition insert.
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table order2_d1_df partition(dt)
select
    orderid, userid, aa, goodname, num, money, datatime,
    date_sub(current_date,1) as dt
from order1
where datatime < current_date;
增
初始化
-- One-time initialization of the incremental table: backfill all history
-- (every row before today) into yesterday's partition.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;  -- was misspelled "nostrick"
insert overwrite table order2_d1_di partition(dt)
select
    orderid, userid, aa, goodname, num, money, datatime,
    date_sub(current_date,1) as dt
from order1
where datatime < current_date;
按天任务
-- Daily incremental load: only yesterday's rows, selected with a half-open
-- day window [yesterday, today).
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;  -- was misspelled "nostrick"
insert overwrite table order2_d1_di partition(dt)
select
    orderid, userid, aa, goodname, num, money, datatime,
    date_sub(current_date,1) as dt
from order1
where datatime >= date_sub(current_date,1)
  and datatime < current_date;
6.创建hive表 order3_count_reduce 聚合层
-- DWS layer: per (user, product, category) metrics, one partition per day.
-- total_* columns are cumulative (from the snapshot table);
-- num/gmv are yesterday-only (from the incremental table).
CREATE TABLE order3_count_reduce (
    userid    string,
    goodname  string,
    aa        string,
    total_num bigint,
    total_gmv double,
    num       bigint,
    gmv       double
) partitioned by (dt string);
7.查询明细层 人在货品上消费数和消费额 插入 order3_count_reduce
hive
-- DWS load: stitch cumulative totals (snapshot) and yesterday-only totals
-- (increment) into one row per (user, product, category). Each branch zeroes
-- the other branch's metrics, so the outer SUM/GROUP BY merges them without
-- double counting. UNION ALL is deliberate — the branches are disjoint.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;  -- was "nostrict" (invalid)
insert overwrite table order3_count_reduce partition(dt)
select
    userid,
    goodname,
    aa,
    sum(total_num) as total_num,
    sum(total_gmv) as total_gmv,
    sum(num) as num,
    sum(gmv) as gmv,
    date_sub(current_date,1) as dt
from
(
    -- cumulative totals from the full snapshot
    select
        userid, goodname, aa,
        sum(num)   as total_num,
        sum(money) as total_gmv,
        0 as num,
        0 as gmv
    from order2_d1_df
    where dt = date_sub(current_date,1)   -- under airflow: '{{ds}}'
    group by userid, goodname, aa
    union all
    -- yesterday-only totals from the incremental table
    select
        userid, goodname, aa,
        0 as total_num,
        0 as total_gmv,
        sum(num)   as num,
        sum(money) as gmv
    from order2_d1_di
    where dt = date_sub(current_date,1)
    group by userid, goodname, aa
) t
group by userid, goodname, aa;
8.创建hive表 order_count_show 应用层
-- ADS layer: per-product metrics for reporting (unpartitioned; overwritten daily).
CREATE TABLE order_count_show (
    goodname  string,
    total_num bigint,
    total_gmv double,
    num       bigint,
    gmv       double
);
9.查询品类是B的 order3_count_reduce 人在货品上消费额 插入 order_count_show
hive
-- ADS load: roll the DWS metrics up to per-product totals for yesterday.
-- (The original left a "//…" pseudo-comment after the query — invalid in
-- HiveQL; SQL line comments use "--".)
insert overwrite table order_count_show
select
    goodname,
    sum(total_num) as total_num,
    sum(total_gmv) as total_gmv,
    sum(num) as num,
    sum(gmv) as gmv
from order3_count_reduce
where dt = date_sub(current_date,1)   -- under airflow: '{{ds}}'
group by goodname;
10.在mysql中创建 order_count_show_mysql
-- MySQL landing table for the ADS export.
-- Metric columns are numeric: the original declared total_num/total_gmv as
-- varchar(100) (breaking ordering and arithmetic on the serving side) and
-- used DOUBLE for money; BIGINT / DECIMAL match the Hive source semantics.
CREATE TABLE order_count_show_mysql
(
    goodname  varchar(100)  DEFAULT NULL,
    total_num bigint        DEFAULT NULL,
    total_gmv decimal(20,2) DEFAULT NULL,
    num       bigint        DEFAULT NULL,
    gmv       decimal(20,2) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
11.将order_count_show数据导入mysql表order_count_show_mysql
# Sqoop export: Hive ADS table files -> MySQL order_count_show_mysql.
# En-dashes and curly quotes from the original are replaced with real "--"
# flags and straight quotes.
sqoop export \
  --connect jdbc:mysql://localhost:3306/test \
  --username root \
  --password 123456 \
  --table order_count_show_mysql \
  --export-dir /data/hive/warehouse/hivetest.db/order_count_show \
  -m 1 \
  --fields-terminated-by '\001' \
  --lines-terminated-by '\n'
# Hive text storage uses \001 as the field delimiter and \n as the line
# terminator; the original '\t' line terminator would make every row
# fail to parse during export.
5.用airflow 写执行脚本
#!/usr/bin/env python
"""Daily warehouse pipeline DAG: sqoop import -> DWD loads -> DWS -> ADS.

The original script used curly quotes (‘…’/“…”), which are a Python syntax
error; all string literals below use straight quotes with the same content.
"""
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.providers.apache.hive.operators.hive import HiveOperator
from datetime import datetime, timedelta

# NOTE(review): the original also had
#   from airflow.example_dags.example_bash_operator import dag
# an accidental IDE auto-import that loaded the example DAG and was
# immediately shadowed by the local `dag` below; it has been removed.

default_args = {
    'owner': 'wh3',
    'start_date': datetime(2022, 9, 6),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    dag_id='wh3',
    default_args=default_args,
    schedule_interval=timedelta(days=1),
)

# Step 1: sqoop full import from MySQL into the Hive ODS table.
sqoop1 = BashOperator(
    task_id='sqoop1',
    bash_command='/Users/wanghao/Documents/tools_package/air/sqoop1.sh ',
    dag=dag,
)


def _hive_task(task_id, hql):
    # All hive steps share one connection and the same jinja settings.
    return HiveOperator(
        task_id=task_id,
        hive_cli_conn_id='hiveconid',
        hql=hql,
        hiveconf_jinja_translate=True,
        dag=dag,
    )


to2fsql = _hive_task('to2fsql', 'to2fsql.hql')     # full load -> order2_d1_df
to2isql1 = _hive_task('to2isql1', 'to2isql1.hql')  # incremental init -> order2_d1_di
to2isql2 = _hive_task('to2isql2', 'to2isql2.hql')  # daily incremental -> order2_d1_di
to3sql = _hive_task('to3sql', 'to3sql.hql')        # DWS aggregation
to4sql = _hive_task('to4sql', 'to4sql.hql')        # ADS load

sqoop1 >> to2fsql >> [to2isql1, to2isql2] >> to3sql >> to4sql
如图
点击单个节点 点log可看日志,点击clear可以重新跑。
hql粘贴到.hql文件中查询插入的表名前需加库名 如: hivetest.order3_count_reduce ( hivetest为库名)
代码见
hadoop 练习项目:https://github.com/QingYang12/hadooptest