一、DWD层建表加载数据说明
### --- 要处理的表有两张:订单表、订单产品表。其中:
~~~ 订单表是周期性事实表;为保留订单状态,可以使用拉链表进行处理;
~~~ 订单产品表普通的事实表,用常规的方法进行处理;
~~~ 如果有数据清洗、数据转换的业务需求,ODS => DWD
~~~ 如果没有数据清洗、数据转换的业务需求,保留在ODS,不做任何变化。这个是本项目的处理方式
### --- 订单状态:
~~~ 3:用户拒收
~~~ 2:未付款的订单
~~~ 1:用户取消
~~~ 0:待发货
~~~ 1:配送中
~~~ 2:用户确认收货
~~~ 订单从创建到最终完成,是有时间限制的;业务上也不允许订单在一个月之后,状态仍然在发生变化;
二、DWD层建表加载数据
### --- DWD层建表加载数据
~~~ 与维表不同,订单事实表的记录数非常多
~~~ 订单有生命周期;订单的状态不可能永远处于变化之中(订单的生命周期一般在15天左右)
~~~ 订单是一个拉链表,而且是分区表
~~~ 分区的目的:订单一旦终止,不会重复计算
~~~ 分区的条件:订单创建日期;保证相同的订单在用一个分区
### --- DWD层建表:订单事实表(拉链表)
~~~ 订单事实表(拉链表)
DROP TABLE IF EXISTS dwd.dwd_trade_orders;
create table dwd.dwd_trade_orders(
`orderId` int,
`orderNo` string,
`userId` bigint,
`status` tinyint,
`productMoney` decimal,
`totalMoney` decimal,
`payMethod` tinyint,
`isPay` tinyint,
`areaId` int,
`tradeSrc` tinyint,
`tradeType` int,
`isRefund` tinyint,
`dataFlag` tinyint,
`createTime` string,
`payTime` string,
`modifiedTime` string,
`start_date` string,
`end_date` string
) COMMENT '订单事实拉链表'
partitioned by (dt string)
STORED AS PARQUET;
### --- DWD层数据加载
~~~ 时间日期格式转换
-- 'yyyy-MM-dd HH:mm:ss' => timestamp => 'yyyy-MM-dd'
select unix_timestamp(modifiedtime, 'yyyy-MM-dd HH:mm:ss')
from ods.ods_trade_orders limit 10;
select from_unixtime(unix_timestamp(modifiedtime, 'yyyy-MM-dd
HH:mm:ss'), 'yyyy-MM-dd')
from ods.ods_trade_orders limit 10;
[root@hadoop02 ~]# vim /data/yanqidw/script/trade/dwd_load_trade_orders.sh
#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
INSERT OVERWRITE TABLE dwd.dwd_trade_orders partition(dt)
SELECT orderId,
orderNo,
userId,
status,
productMoney,
totalMoney,
payMethod,
isPay,
areaId,
tradeSrc,
tradeType,
isRefund,
dataFlag,
createTime,
payTime,
modifiedTime,
case when modifiedTime is not null then from_unixtime(unix_timestamp(modifiedTime,
'yyyy-MM-dd HH:mm:ss'),'yyyy-MM-dd') else from_unixtime(unix_timestamp(createTime,
'yyyy-MM-dd HH:mm:ss'), 'yyyy-MM-dd') end as start_date,
'9999-12-31' as end_date,
from_unixtime(unix_timestamp(createTime,
'yyyy-MM-dd HH:mm:ss'),
'yyyy-MM-dd') as dt FROM ods.ods_trade_orders
WHERE dt='$do_date'
union all
SELECT A.orderId,
A.orderNo,
A.userId,
A.status,
A.productMoney,
A.totalMoney,
A.payMethod,
A.isPay,
A.areaId,
A.tradeSrc,
A.tradeType,
A.isRefund,
A.dataFlag,
A.createTime,
A.payTime,
A.modifiedTime,
A.start_date,
CASE WHEN B.orderid IS NOT NULL AND A.end_date > '$do_date'
THEN date_add('$do_date', -1) ELSE A.end_date END AS end_date,
from_unixtime(unix_timestamp(A.createTime,
'yyyy-MM-dd HH:mm:ss'),
'yyyy-MM-dd') as dt FROM (SELECT * FROM dwd.dwd_trade_orders WHERE
dt>date_add('$do_date', -15)) A
left outer join (SELECT * FROM ods.ods_trade_orders
WHERE dt='$do_date') B
ON A.orderId = B.orderId;
"
hive -e "$sql"
~~~ # 加载数据到DWD层订单事实表&拉链表
[root@hadoop02 ~]# sh /data/yanqidw/script/trade/dwd_load_trade_orders.sh 2020-07-12
~~~ # 查看DWD层订单事实表&拉链表是否加载到数据
hive (default)> show partitions dwd.dwd_trade_orders;
partition
dt=2020-06-28
dt=2020-06-29
dt=2020-06-30
dt=2020-07-01
dt=2020-07-02
dt=2020-07-03
dt=2020-07-04
dt=2020-07-05
dt=2020-07-06
hive (default)> select * from dwd.dwd_trade_orders limit 5;
dwd_trade_orders.orderid dwd_trade_orders.orderno dwd_trade_orders.userid dwd_trade_orders.status dwd_trade_orders.productmoney dwd_trade_orders.totalmoney dwd_trade_orders.paymethod dwd_trade_orders.ispay dwd_trade_orders.areaid dwd_trade_orders.tradesrc dwd_trade_orders.tradetype dwd_trade_orders.isrefund dwd_trade_orders.dataflag dwd_trade_orders.createtime dwd_trade_orders.paytime dwd_trade_orders.modifiedtime dwd_trade_orders.start_date dwd_trade_orders.end_date dwd_trade_orders.dt
1 23a0b124546 98 2 0 10468 2 0 123 0 0 1 2 2020-06-28 18:14:01 2020-06-28 18:14:01 2020-07-12 13:22:22 2020-07-12 9999-12-31 2020-06-28
2 23a0b124546 121 2 0 6331 2 0 123 0 0 0 1 2020-06-28 16:55:02 2020-06-28 16:55:02 2020-07-12 13:22:22 2020-07-12 9999-12-31 2020-06-28
3 23a0b124546 35 2 0 1988 4 0 123 0 0 0 1 2020-06-28 12:07:01 2020-06-28 12:07:01 2020-07-12 13:22:22 2020-07-12 9999-12-31 2020-06-28
4 23a0b124546 161 2 0 43659 4 0 123 0 0 0 1 2020-06-28 13:19:48 2020-06-28 13:19:48 2020-07-12 13:22:22 2020-07-12 9999-12-31 2020-06-28
5 23a0b124546 72 2 0 32757 0 0 123 0 0 0 1 2020-06-28 22:14:21 2020-06-28 22:14:21 2020-07-12 13:22:22 2020-07-12 9999-12-31 2020-06-28
hive (default)> select count(*) from dwd.dwd_trade_orders;
354