数据仓库搭建之DWS层搭建
在搭建该层时,我们需要注意的是:
1)本层的设计主要参考指标体系
2)DWS层数据的数据存储格式为orc列式存储+snappy压缩。
3)DWS层表名的命名规范为:dws _ 数据域 _ 统计粒度 _ 业务过程 _ 统计周期(1d/nd/td)
其中1d表示最近一天的聚合统计,nd表示最近n天的聚合统计,而td表示历史至今的聚合统计。
对于DWS层而言,我们需要根据我们的需要来确定是否在DWS层构建聚合数据的表,如果某些数据指标被多个需求使用到,那么我们就需要提前进行汇聚计算。这样会大大提高我们的效率。
1.指标体系确定
我们在先前进行数据仓库设计的时候,根据现有的统计需求整理出了指标体系,后续又根据指标体系抽取出了所有的派生指标:
我们将相同业务过程,相同统计周期以及相同统计粒度的指标都放在一张DWS层表当中。
2.DWS层最近1日汇总表设计
2.1 交易域用户商品粒度订单(下单)最近1日汇总表
2.1.1建表语句
DROP TABLE IF EXISTS dws_trade_user_sku_order_1d;
CREATE EXTERNAL TABLE dws_trade_user_sku_order_1d
(
`user_id` STRING COMMENT '用户id',
`sku_id` STRING COMMENT 'sku_id',
`sku_name` STRING COMMENT 'sku名称',
`category1_id` STRING COMMENT '一级分类id',
`category1_name` STRING COMMENT '一级分类名称',
`category2_id` STRING COMMENT '一级分类id',
`category2_name` STRING COMMENT '一级分类名称',
`category3_id` STRING COMMENT '一级分类id',
`category3_name` STRING COMMENT '一级分类名称',
`tm_id` STRING COMMENT '品牌id',
`tm_name` STRING COMMENT '品牌名称',
`order_count_1d` BIGINT COMMENT '最近1日下单次数',
`order_num_1d` BIGINT COMMENT '最近1日下单件数',
`order_original_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单原始金额',
`activity_reduce_amount_1d` DECIMAL(16, 2) COMMENT '最近1日活动优惠金额',
`coupon_reduce_amount_1d` DECIMAL(16, 2) COMMENT '最近1日优惠券优惠金额',
`order_total_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单最终金额'
) COMMENT '交易域用户商品粒度订单最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_trade_user_sku_order_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.1.2数据装载
1)首日装载:
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_trade_user_sku_order_1d partition(dt)
select
user_id,
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name,
order_count_1d,
order_num_1d,
order_original_amount_1d,
activity_reduce_amount_1d,
coupon_reduce_amount_1d,
order_total_amount_1d,
dt
from
(
select
dt,
user_id,
sku_id,
count(*) order_count_1d,
sum(sku_num) order_num_1d,
sum(split_original_amount) order_original_amount_1d,
sum(nvl(split_activity_amount,0.0)) activity_reduce_amount_1d,
sum(nvl(split_coupon_amount,0.0)) coupon_reduce_amount_1d,
sum(split_total_amount) order_total_amount_1d
from dwd_trade_order_detail_inc
group by dt,user_id,sku_id
)od
left join
(
select
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name
from dim_sku_full
where dt='2022-05-01'
)sku
on od.sku_id=sku.id;
2)每日装载:
insert overwrite table dws_trade_user_sku_order_1d partition(dt='2022-05-02')
select
user_id,
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name,
order_count,
order_num,
order_original_amount,
activity_reduce_amount,
coupon_reduce_amount,
order_total_amount
from
(
select
user_id,
sku_id,
count(*) order_count,
sum(sku_num) order_num,
sum(split_original_amount) order_original_amount,
sum(nvl(split_activity_amount,0)) activity_reduce_amount,
sum(nvl(split_coupon_amount,0)) coupon_reduce_amount,
sum(split_total_amount) order_total_amount
from dwd_trade_order_detail_inc
where dt='2020-06-15'
group by user_id,sku_id
)od
left join
(
select
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name
from dim_sku_full
where dt='2022-05-02'
)sku
on od.sku_id=sku.id;
2.2 交易域用户商品粒度退单最近1日汇总表
2.2.1建表语句
DROP TABLE IF EXISTS dws_trade_user_sku_order_refund_1d;
CREATE EXTERNAL TABLE dws_trade_user_sku_order_refund_1d
(
`user_id` STRING COMMENT '用户id',
`sku_id` STRING COMMENT 'sku_id',
`sku_name` STRING COMMENT 'sku名称',
`category1_id` STRING COMMENT '一级分类id',
`category1_name` STRING COMMENT '一级分类名称',
`category2_id` STRING COMMENT '一级分类id',
`category2_name` STRING COMMENT '一级分类名称',
`category3_id` STRING COMMENT '一级分类id',
`category3_name` STRING COMMENT '一级分类名称',
`tm_id` STRING COMMENT '品牌id',
`tm_name` STRING COMMENT '品牌名称',
`order_refund_count_1d` BIGINT COMMENT '最近1日退单次数',
`order_refund_num_1d` BIGINT COMMENT '最近1日退单件数',
`order_refund_amount_1d` DECIMAL(16, 2) COMMENT '最近1日退单金额'
) COMMENT '交易域用户商品粒度退单最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_trade_user_sku_order_refund_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.2.2数据装载
1)首日装载
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_trade_user_sku_order_refund_1d partition(dt)
select
user_id,
sku_id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name,
order_refund_count,
order_refund_num,
order_refund_amount,
dt
from
(
select
dt,
user_id,
sku_id,
count(*) order_refund_count,
sum(refund_num) order_refund_num,
sum(refund_amount) order_refund_amount
from dwd_trade_order_refund_inc
group by dt,user_id,sku_id
)od
left join
(
select
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name
from dim_sku_full
where dt='2022-05-01'
)sku
on od.sku_id=sku.id;
2)每日装载
insert overwrite table dws_trade_user_sku_order_refund_1d partition(dt='2022-05-02')
select
user_id,
sku_id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name,
order_refund_count,
order_refund_num,
order_refund_amount
from
(
select
user_id,
sku_id,
count(*) order_refund_count,
sum(refund_num) order_refund_num,
sum(refund_amount) order_refund_amount
from dwd_trade_order_refund_inc
where dt='2022-05-02'
group by user_id,sku_id
)od
left join
(
select
id,
sku_name,
category1_id,
category1_name,
category2_id,
category2_name,
category3_id,
category3_name,
tm_id,
tm_name
from dim_sku_full
where dt='2022-05-02'
)sku
on od.sku_id=sku.id;
2.3 交易域用户粒度订单(下单)最近1日汇总表
2.3.1建表语句
DROP TABLE IF EXISTS dws_trade_user_order_1d;
CREATE EXTERNAL TABLE dws_trade_user_order_1d
(
`user_id` STRING COMMENT '用户id',
`order_count_1d` BIGINT COMMENT '最近1日下单次数',
`order_num_1d` BIGINT COMMENT '最近1日下单商品件数',
`order_original_amount_1d` DECIMAL(16, 2) COMMENT '最近1日最近1日下单原始金额',
`activity_reduce_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单活动优惠金额',
`coupon_reduce_amount_1d` DECIMAL(16, 2) COMMENT '下单优惠券优惠金额',
`order_total_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单最终金额'
) COMMENT '交易域用户粒度订单最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_trade_user_order_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.3.2数据装载
1)首日装载
insert overwrite table dws_trade_user_order_1d partition(dt)
select
user_id,
count(distinct(order_id)),
sum(sku_num),
sum(split_original_amount),
sum(nvl(split_activity_amount,0)),
sum(nvl(split_coupon_amount,0)),
sum(split_total_amount),
dt
from dwd_trade_order_detail_inc
group by user_id,dt;
2)每日装载
insert overwrite table dws_trade_user_order_1d partition(dt='2022-05-02')
select
user_id,
count(distinct(order_id)),
sum(sku_num),
sum(split_original_amount),
sum(nvl(split_activity_amount,0)),
sum(nvl(split_coupon_amount,0)),
sum(split_total_amount)
from dwd_trade_order_detail_inc
where dt='2022-05-02'
group by user_id;
2.4 交易域用户粒度加购最近1日汇总表
2.4.1建表语句
DROP TABLE IF EXISTS dws_trade_user_cart_add_1d;
CREATE EXTERNAL TABLE dws_trade_user_cart_add_1d
(
`user_id` STRING COMMENT '用户id',
`cart_add_count_1d` BIGINT COMMENT '最近1日加购次数',
`cart_add_num_1d` BIGINT COMMENT '最近1日加购商品件数'
) COMMENT '交易域用户粒度加购最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_trade_user_cart_add_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.4.2数据装载
1)首日装载
insert overwrite table dws_trade_user_cart_add_1d partition(dt)
select
user_id,
count(*),
sum(sku_num),
dt
from dwd_trade_cart_add_inc
group by user_id,dt;
2)每日装载
insert overwrite table dws_trade_user_cart_add_1d partition(dt='2022-05-02')
select
user_id,
count(*),
sum(sku_num)
from dwd_trade_cart_add_inc
where dt='2022-05-02'
group by user_id;
2.5 交易域用户粒度支付最近1日汇总表
2.5.1建表语句
DROP TABLE IF EXISTS dws_trade_user_payment_1d;
CREATE EXTERNAL TABLE dws_trade_user_payment_1d
(
`user_id` STRING COMMENT '用户id',
`payment_count_1d` BIGINT COMMENT '最近1日支付次数',
`payment_num_1d` BIGINT COMMENT '最近1日支付商品件数',
`payment_amount_1d` DECIMAL(16, 2) COMMENT '最近1日支付金额'
) COMMENT '交易域用户粒度支付最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_trade_user_payment_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.5.2数据装载
1)首日装载
insert overwrite table dws_trade_user_payment_1d partition(dt)
select
user_id,
count(distinct(order_id)),
sum(sku_num),
sum(split_payment_amount),
dt
from dwd_trade_pay_detail_suc_inc
group by user_id,dt;
2)每日装载
insert overwrite table dws_trade_user_payment_1d partition(dt='2022-05-02')
select
user_id,
count(distinct(order_id)),
sum(sku_num),
sum(split_payment_amount)
from dwd_trade_pay_detail_suc_inc
where dt='2022-05-02'
group by user_id;
2.6 交易域省份粒度订单最近1日汇总表
2.6.1建表语句
DROP TABLE IF EXISTS dws_trade_province_order_1d;
CREATE EXTERNAL TABLE dws_trade_province_order_1d
(
`province_id` STRING COMMENT '省份id',
`province_name` STRING COMMENT '省份名称',
`area_code` STRING COMMENT '地区编码',
`iso_code` STRING COMMENT '旧版ISO-3166-2编码',
`iso_3166_2` STRING COMMENT '新版版ISO-3166-2编码',
`order_count_1d` BIGINT COMMENT '最近1日下单次数',
`order_original_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单原始金额',
`activity_reduce_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单活动优惠金额',
`coupon_reduce_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单优惠券优惠金额',
`order_total_amount_1d` DECIMAL(16, 2) COMMENT '最近1日下单最终金额'
) COMMENT '交易域省份粒度订单最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_trade_province_order_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.6.2数据装载
1)首日装载
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_trade_province_order_1d partition(dt)
select
province_id,
province_name,
area_code,
iso_code,
iso_3166_2,
order_count_1d,
order_original_amount_1d,
activity_reduce_amount_1d,
coupon_reduce_amount_1d,
order_total_amount_1d,
dt
from
(
select
province_id,
count(distinct(order_id)) order_count_1d,
sum(split_original_amount) order_original_amount_1d,
sum(nvl(split_activity_amount,0)) activity_reduce_amount_1d,
sum(nvl(split_coupon_amount,0)) coupon_reduce_amount_1d,
sum(split_total_amount) order_total_amount_1d,
dt
from dwd_trade_order_detail_inc
group by province_id,dt
)o
left join
(
select
id,
province_name,
area_code,
iso_code,
iso_3166_2
from dim_province_full
where dt='2022-05-01'
)p
on o.province_id=p.id;
2)每日装载
insert overwrite table dws_trade_province_order_1d partition(dt='2022-05-02')
select
province_id,
province_name,
area_code,
iso_code,
iso_3166_2,
order_count_1d,
order_original_amount_1d,
activity_reduce_amount_1d,
coupon_reduce_amount_1d,
order_total_amount_1d
from
(
select
province_id,
count(distinct(order_id)) order_count_1d,
sum(split_original_amount) order_original_amount_1d,
sum(nvl(split_activity_amount,0)) activity_reduce_amount_1d,
sum(nvl(split_coupon_amount,0)) coupon_reduce_amount_1d,
sum(split_total_amount) order_total_amount_1d
from dwd_trade_order_detail_inc
where dt='2022-05-02'
group by province_id
)o
left join
(
select
id,
province_name,
area_code,
iso_code,
iso_3166_2
from dim_province_full
where dt='2022-05-02'
)p
on o.province_id=p.id;
2.7 交易域用户粒度退单最近1日汇总表
2.7.1建表语句
DROP TABLE IF EXISTS dws_trade_user_order_refund_1d;
CREATE EXTERNAL TABLE dws_trade_user_order_refund_1d
(
`user_id` STRING COMMENT '用户id',
`order_refund_count_1d` BIGINT COMMENT '最近1日退单次数',
`order_refund_num_1d` BIGINT COMMENT '最近1日退单商品件数',
`order_refund_amount_1d` DECIMAL(16, 2) COMMENT '最近1日退单金额'
) COMMENT '交易域用户粒度退单最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_trade_user_order_refund_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.7.2数据装载
1)首日装载
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_trade_user_order_refund_1d partition(dt)
select
user_id,
count(*) order_refund_count,
sum(refund_num) order_refund_num,
sum(refund_amount) order_refund_amount,
dt
from dwd_trade_order_refund_inc
group by user_id,dt;
2)每日装载
insert overwrite table dws_trade_user_order_refund_1d partition(dt='2022-05-02')
select
user_id,
count(*),
sum(refund_num),
sum(refund_amount)
from dwd_trade_order_refund_inc
where dt='2022-05-02'
group by user_id;
2.8 流量域会话粒度页面浏览最近1日汇总表
2.8.1建表语句
DROP TABLE IF EXISTS dws_traffic_session_page_view_1d;
CREATE EXTERNAL TABLE dws_traffic_session_page_view_1d
(
`session_id` STRING COMMENT '会话id',
`mid_id` string comment '设备id',
`brand` string comment '手机品牌',
`model` string comment '手机型号',
`operate_system` string comment '操作系统',
`version_code` string comment 'app版本号',
`channel` string comment '渠道',
`during_time_1d` BIGINT COMMENT '最近1日访问时长',
`page_count_1d` BIGINT COMMENT '最近1日访问页面数'
) COMMENT '流量域会话粒度页面浏览最近1日汇总表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_traffic_session_page_view_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.8.2数据装载
insert overwrite table dws_traffic_session_page_view_1d partition(dt='2022-05-01')
select
session_id,
mid_id,
brand,
model,
operate_system,
version_code,
channel,
sum(during_time),
count(*)
from dwd_traffic_page_view_inc
where dt='2022-05-01'
group by session_id,mid_id,brand,model,operate_system,version_code,channel;
2.9 流量域访客页面粒度页面浏览最近1日汇总表
2.9.1建表语句
DROP TABLE IF EXISTS dws_traffic_page_visitor_page_view_1d;
CREATE EXTERNAL TABLE dws_traffic_page_visitor_page_view_1d
(
`mid_id` STRING COMMENT '访客id',
`brand` string comment '手机品牌',
`model` string comment '手机型号',
`operate_system` string comment '操作系统',
`page_id` STRING COMMENT '页面id',
`during_time_1d` BIGINT COMMENT '最近1日浏览时长',
`view_count_1d` BIGINT COMMENT '最近1日访问次数'
) COMMENT '流量域访客页面粒度页面浏览最近1日汇总事实表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dws/dws_traffic_page_visitor_page_view_1d'
TBLPROPERTIES ('orc.compress' = 'snappy');
2.9.2数据装载
insert overwrite table dws_traffic_page_visitor_page_view_1d partition(dt='2022-05-01')
select
mid_id,
brand,
model,
operate_system,
page_id,
sum(during_time),
count(*)
from dwd_traffic_page_view_inc
where dt='2022-05-01'
group by mid_id,brand,model,operate_system,page_id;
2.10 数据装载脚本编写
2.10.1首日数据装载脚本编写
(1)在hadoop102的**/home/hadoop/bin目录下创建dwd_to_dws_1d_init.sh**
脚本内容如下所示:
#!/bin/bash
APP=gmall
if [ -n "$2" ]