DWS層的構建過程
ADS層需求:
最近1、7、30日各品牌訂單數
最近1、7、30日各品牌订单人数
最近1、7、30日各品類訂單數
最近1、7、30日各品類訂單人數
分析過程:
將相同的統計周期,業務過程,統計粒度放入同一張dws表,需要2*2四張表
dws表的命名規範:dws_数据域_统计粒度_业务过程_统计周期(1d/nd/td)
這四張表分別是:
1.dws_trade_tm_order_1d
2.dws_trade_tm_order_nd
3.dws_trade_category_order_1d
4.dws_trade_category_order_nd
首先來看第一張表的構建過程:dws_trade_tm_order_1d
1.需要哪些字段:
行列分區:
行:由統計的粒度決定:tm_id,tm_name
列:根據派生指標決定(主要由dwd層的事實決定),order_count,order_user_count,order_total_amount
2.分區
按照日期分區,一天一個分區
3.建表語句
DROP TABLE IF EXISTS dws_trade_tm_order_1d;
CREATE EXTERNAL TABLE dws_trade_tm_order_1d
(
`tm_id` STRING COMMENT '品牌编号',
`tm_name` STRING COMMENT '品牌名稱',
`order_count` STRING COMMENT '下單數量',
`order_user_count` STRING COMMENT '下單人數',
`order_total_amount` STRING COMMENT '下單金額'
) COMMENT '交易域下单明细事务事实表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/warehouse/gmall/dwd/dws_trade_tm_order_1d/'
TBLPROPERTIES ('orc.compress' = 'snappy');
4.數據裝載
分析:
-
數據來源于dwd層的下單事實表
-
下單事實表的構建過程:
1.將和下單過程相關聯的業務表找出來,分別是: ods_order_info_inc,ods_order_detail_inc,ods_order_detail_activity_inc,ods_order_detail_coupon_inc
2.主表是ods_order_detail_inc,將這張表和其他表進行left join
3.建表语句
DROP TABLE IF EXISTS dwd_trade_order_detail_inc;
CREATE EXTERNAL TABLE dwd_trade_order_detail_inc
(
`id` STRING COMMENT '编号',
`order_id` STRING COMMENT '订单id',
`user_id` STRING COMMENT '用户id',
`sku_id` STRING COMMENT '商品id',
`province_id` STRING COMMENT '省份id',
`activity_id` STRING COMMENT '参与活动规则id',
`activity_rule_id` STRING COMMENT '参与活动规则id',
`coupon_id` STRING COMMENT '使用优惠券id',
`date_id` STRING COMMENT '下单日期id',
`create_time` STRING COMMENT '下单时间',
`source_id` STRING COMMENT '来源编号',
`source_type_code` STRING COMMENT '来源类型编码',
`source_type_name` STRING COMMENT '来源类型名称',
`sku_num` BIGINT COMMENT '商品数量',
`split_original_amount` DECIMAL(16, 2) COMMENT '原始价格',
`split_activity_amount` DECIMAL(16, 2) COMMENT '活动优惠分摊',
`split_coupon_amount` DECIMAL(16, 2) COMMENT '优惠券优惠分摊',
`split_total_amount` DECIMAL(16, 2) COMMENT '最终价格分摊'
) COMMENT '交易域下单明细事务事实表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/warehouse/gmall/dwd/dwd_trade_order_detail_inc/'
TBLPROPERTIES ('orc.compress' = 'snappy');
- 数据装载
(1)首日装载
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_trade_order_detail_inc partition (dt)
select
od.id,
order_id,
user_id,
sku_id,
province_id,
activity_id,
activity_rule_id,
coupon_id,
date_format(create_time, 'yyyy-MM-dd') date_id,
create_time,
source_id,
source_type,
dic_name,
sku_num,
split_original_amount,
nvl(split_activity_amount,0.0),
nvl(split_coupon_amount,0.0),
split_total_amount,
date_format(create_time,'yyyy-MM-dd')
from
(
select
data.id,
data.order_id,
data.sku_id,
data.create_time,
data.source_id,
data.source_type,
data.sku_num,
data.sku_num * data.order_price split_original_amount,
data.split_total_amount,
data.split_activity_amount,
data.split_coupon_amount
from ods_order_detail_inc
where dt = '2020-06-14'
and type = 'bootstrap-insert'
) od
left join
(
select
data.id,
data.user_id,
data.province_id
from ods_order_info_inc
where dt = '2020-06-14'
and type = 'bootstrap-insert'
) oi
on od.order_id = oi.id
left join
(
select
data.order_detail_id,
data.activity_id,
data.activity_rule_id
from ods_order_detail_activity_inc
where dt = '2020-06-14'
and type = 'bootstrap-insert'
) act
on od.id = act.order_detail_id
left join
(
select
data.order_detail_id,
data.coupon_id
from ods_order_detail_coupon_inc
where dt = '2020-06-14'
and type = 'bootstrap-insert'
) cou
on od.id = cou.order_detail_id
left join
(
select
dic_code,
dic_name
from ods_base_dic_full
where dt='2020-06-14'
and parent_code='24'
)dic
on od.source_type=dic.dic_code;
(2)每日装载
insert overwrite table dwd_trade_order_detail_inc partition (dt='2020-06-15')
select
od.id,
order_id,
user_id,
sku_id,
province_id,
activity_id,
activity_rule_id,
coupon_id,
date_id,
create_time,
source_id,
source_type,
dic_name,
sku_num,
split_original_amount,
nvl(split_activity_amount,0.0),
nvl(split_coupon_amount,0.0),
split_total_amount
from
(
select
data.id,
data.order_id,
data.sku_id,
date_format(data.create_time, 'yyyy-MM-dd') date_id,
data.create_time,
data.source_id,
data.source_type,
data.sku_num,
data.sku_num * data.order_price split_original_amount,
data.split_total_amount,
data.split_activity_amount,
data.split_coupon_amount
from ods_order_detail_inc
where dt = '2020-06-15'
and type = 'insert'
) od
left join
(
select
data.id,
data.user_id,
data.province_id
from ods_order_info_inc
where dt = '2020-06-15'
and type = 'insert'
) oi
on od.order_id = oi.id
left join
(
select
data.order_detail_id,
data.activity_id,
data.activity_rule_id
from ods_order_detail_activity_inc
where dt = '2020-06-15'
and type = 'insert'
) act
on od.id = act.order_detail_id
left join
(
select
data.order_detail_id,
data.coupon_id
from ods_order_detail_coupon_inc
where dt = '2020-06-15'
and type = 'insert'
) cou
on od.id = cou.order_detail_id
left join
(
select
dic_code,
dic_name
from ods_base_dic_full
where dt='2020-06-15'
and parent_code='24'
)dic
on od.source_type=dic.dic_code;
- 可以看出訂單事實表中並沒有tm_id,tm_name,所以需要和商品維度表進行join
1)建表语句
DROP TABLE IF EXISTS dim_sku_full;
CREATE EXTERNAL TABLE dim_sku_full
(
`id` STRING COMMENT 'sku_id',
`price` DECIMAL(16, 2) COMMENT '商品价格',
`sku_name` STRING COMMENT '商品名称',
`sku_desc` STRING COMMENT '商品描述',
`weight` DECIMAL(16, 2) COMMENT '重量',
`is_sale` BOOLEAN COMMENT '是否在售',
`spu_id` STRING COMMENT 'spu编号',
`spu_name` STRING COMMENT 'spu名称',
`category3_id` STRING COMMENT '三级分类id',
`category3_name` STRING COMMENT '三级分类名称',
`category2_id` STRING COMMENT '二级分类id',
`category2_name` STRING COMMENT '二级分类名称',
`category1_id` STRING COMMENT '一级分类id',
`category1_name` STRING COMMENT '一级分类名称',
`tm_id` STRING COMMENT '品牌id',
`tm_name` STRING COMMENT '品牌名称',
`sku_attr_values` ARRAY<STRUCT<attr_id :STRING,value_id :STRING,attr_name :STRING,value_name:STRING>> COMMENT '平台属性',
`sku_sale_attr_values` ARRAY<STRUCT<sale_attr_id :STRING,sale_attr_value_id :STRING,sale_attr_name :STRING,sale_attr_value_name:STRING>> COMMENT '销售属性',
`create_time` STRING COMMENT '创建时间'
) COMMENT '商品维度表'
PARTITIONED BY (`dt` STRING)
STORED AS ORC
LOCATION '/warehouse/gmall/dim/dim_sku_full/'
TBLPROPERTIES ('orc.compress' = 'snappy');
2)数据装载
with
sku as
(
select
id,
price,
sku_name,
sku_desc,
weight,
is_sale,
spu_id,
category3_id,
tm_id,
create_time
from ods_sku_info_full
where dt='2020-06-14'
),
spu as
(
select
id,
spu_name
from ods_spu_info_full
where dt='2020-06-14'
),
c3 as
(
select
id,
name,
category2_id
from ods_base_category3_full
where dt='2020-06-14'
),
c2 as
(
select
id,
name,
category1_id
from ods_base_category2_full
where dt='2020-06-14'
),
c1 as
(
select
id,
name
from ods_base_category1_full
where dt='2020-06-14'
),
tm as
(
select
id,
tm_name
from ods_base_trademark_full
where dt='2020-06-14'
),
attr as
(
select
sku_id,
collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
from ods_sku_attr_value_full
where dt='2020-06-14'
group by sku_id
),
sale_attr as
(
select
sku_id,
collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
from ods_sku_sale_attr_value_full
where dt='2020-06-14'
group by sku_id
)
insert overwrite table dim_sku_full partition(dt='2020-06-14')
select
sku.id,
sku.price,
sku.sku_name,
sku.sku_desc,
sku.weight,
sku.is_sale,
sku.spu_id,
spu.spu_name,
sku.category3_id,
c3.name,
c3.category2_id,
c2.name,
c2.category1_id,
c1.name,
sku.tm_id,
tm.tm_name,
attr.attrs,
sale_attr.sale_attrs,
sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;
- dws_trade_tm_order_1d 數據裝載過程
---數據裝載
-- 首日:存在歷史數據,也需要統計歷史每一天的最近1日的結果
with t1 as (
select
sku_id,
user_id,
sku_num,
split_total_amount,
dt
from dwd_trade_order_detail_inc
where dt <= '2020-06-14'
),t2 as (
select id,
tm_id,
tm_name
from dim_sku_full
where dt = '2020-06-14'
)
insert overwrite table dws_trade_tm_order_1d partition (dt)
select
tm_id,
tm_name,
count(1),
count(distinct user_id),
sum(split_total_amount),
dt
from t1
left join t2
on t1.sku_id = t2.id
group by dt,tm_id,tm_name;
dws_trade_tm_order_nd
--dws_trade_tm_order_nd
--建表語句
DROP TABLE IF EXISTS dws_trade_tm_order_nd;
CREATE EXTERNAL TABLE dws_trade_tm_order_nd
(
`tm_id` STRING COMMENT '品牌编号',
`tm_name` STRING COMMENT '品牌名稱',
`order_count_7d` STRING COMMENT '最近7日下單數量',
`order_user_count_7d` STRING COMMENT '最近7日下單人數',
`order_total_amount_7d` STRING COMMENT '最近7日下單金額',
`order_count_30d` STRING COMMENT '最近30日下單數量',
`order_user_count_30d` STRING COMMENT '最近30日下單人數',
`order_total_amount_30d` STRING COMMENT '最近30日下單金額'
) COMMENT '交易域下单明细事务事实表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/warehouse/gmall/dwd/dws_trade_tm_order_nd/'
TBLPROPERTIES ('orc.compress' = 'snappy');
insert overwrite table dws_trade_tm_order_nd
select
tm_id,
tm_name,
sum(`if`(dt>=date_sub('2020-06-14',6),order_count,0)) order_count_7d,
sum(`if`(dt>=date_sub('2020-06-14',6),order_user_count,0)) order_user_count_7d,
sum(`if`(dt>=date_sub('2020-06-14',6),order_total_amount,0)) order_total_amount_7d,
sum(order_count) order_count_30d,
sum(order_user_count) order_user_count_30d,
sum(order_total_amount) order_total_amount_30d,
dt
from dws_trade_tm_order_1d
where dt <= '2020-06-14' and dt >=date_sub('2020-06-14',29)
group by dt,tm_name,tm_id;
問題:
這樣的話,會有一個問題:統計的最近7天,最近30天的人數是直接相加的,這樣是錯誤的,同一個用戶可能連續下單了好幾天,那麽這樣統計得到的數據就是錯誤的
解決辦法:
將粒度進行進一步的細分:用戶-品牌粒度
V2版本
dws_trade_user_tm_order_1d
DROP TABLE IF EXISTS dws_trade_user_tm_order_1d;
CREATE EXTERNAL TABLE dws_trade_user_tm_order_1d
(
`user_id` STRING COMMENT '用戶id',
`tm_id` STRING COMMENT '品牌编号',
`tm_name` STRING COMMENT '品牌名稱',
`order_count` STRING COMMENT '下單數量',
`order_total_amount` STRING COMMENT '下單金額'
) COMMENT '交易域用戶品牌粒度订单最近1日汇总表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/warehouse/gmall/dwd/dws_trade_user_tm_order_1d/'
TBLPROPERTIES ('orc.compress' = 'snappy');
with order as (
select user_id,
sku_id,
sku_num,
split_total_amount,
dt
from dwd_trade_order_detail_inc
where dt <= '2020-06-14'
),sku as (
select id,
tm_id,
tm_name
from dim_sku_full
where dt = '2020-06-14'
)
----每個用戶下單的每個品牌的情況
insert overwrite table dws_trade_user_tm_order_1d partition (dt)
select user_id,
tm_id,
tm_name,
sum(sku_num),
sum(split_total_amount),
dt
from order
left join sku
on order.sku_id = sku.id
group by dt, user_id, tm_name, tm_id;
dws_trade_user_tm_order_nd
DROP TABLE IF EXISTS dws_trade_user_tm_order_nd;
CREATE EXTERNAL TABLE dws_trade_user_tm_order_nd
(
`user_id` STRING COMMENT '用戶id',
`tm_id` STRING COMMENT '品牌编号',
`tm_name` STRING COMMENT '品牌名稱',
`order_count_7d` STRING COMMENT '最近7日下單數量',
`order_total_amount_7d` STRING COMMENT '最近7日下單金額',
`order_count_30d` STRING COMMENT '最近30日下單數量',
`order_total_amount_30d` STRING COMMENT '最近30日下單金額'
) COMMENT '交易域用戶品牌粒度订单最近1日汇总表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/warehouse/gmall/dwd/dws_trade_user_tm_order_nd/'
TBLPROPERTIES ('orc.compress' = 'snappy');
insert overwrite table dws_trade_user_tm_order_nd partition (dt)
select
user_id,
tm_id,
tm_name,
sum(if(dt>=date_sub('2020-06-14',6),order_count,0)) order_count_7d,
sum(if(dt>=date_sub('2020-06-14',6),order_total_amount,0)) order_total_amount_7d,
sum(order_count) order_count_30d,
sum(order_total_amount) order_total_amount_30d,
dt
from dws_trade_user_tm_order_1d
where dt <= '2020-06-14' and dt >= date_sub('2020-06-14',29)
group by dt,user_id,tm_id,tm_name;
最近1天個品牌的下單數和下單人數
select
tm_id,
tm_name,
sum(order_count),
count(distinct user_id),
sum(order_total_amount),
dt
from dws_trade_user_tm_order_1d
where dt <= '2020-06-14'
group by dt,tm_name,tm_id;
最近7天和30天各品牌的下單數和下單人數
select
tm_id,
tm_name,
sum(order_count_7d),
count(distinct(`if`(order_count_7d>0,user_id,null))),
sum(order_total_amount_30d),
count(distinct user_id),
dt
from dws_trade_user_tm_order_nd
where dt <= '2020-06-14'
group by dt,tm_name,tm_id
在計算這個的時候會有點問題,主要是因爲在計算7天和30天的時候,可能7天内沒有用戶下單,但是7天之外,30天之内的範圍内有下單的用戶,所以需要使用條件查詢
count(distinct(`if`(order_count_7d>0,user_id,null))),
V3版本
再進一步進行粒度的細分,用戶商品粒度
dws_trade_user_tm_order_1d
建表語句:
DROP TABLE IF EXISTS dws_trade_user_sku_order_1d;
CREATE EXTERNAL TABLE dws_trade_user_sku_order_1d
(
`user_id` STRING COMMENT '用戶id',
`sku_id` STRING COMMENT '商品id',
`tm_id` STRING COMMENT '品牌编号',
`tm_name` STRING COMMENT '品牌名稱',
`category3_id` STRING COMMENT '三级分类id',
`category3_name` STRING COMMENT '三级分类名称',
`category2_id` STRING COMMENT '二级分类id',
`category2_name` STRING COMMENT '二级分类名称',
`category1_id` STRING COMMENT '一级分类id',
`category1_name` STRING COMMENT '一级分类名称',
`order_num_1d` STRING COMMENT '最近1日下单件数',
`order_total_amount_1d` STRING COMMENT '最近1日下单件数',
`order_count_1d` BIGINT COMMENT '最近1日下单次数'
) COMMENT '交易域用戶商品粒度订单最近1日汇总表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/warehouse/gmall/dwd/dws_trade_user_sku_order_1d/'
TBLPROPERTIES ('orc.compress' = 'snappy');
數據裝載
dws_1d表每个分区存放当日的统计结果
–数据加载
----首日: [存在历史数据,也需要统计历史每一天的最近1日的结果]
--一個人買同一個商品的金額,件數
with order as (
select
user_id,
sku_id,
count(1) order_count_1d,
sum(sku_num) order_num_1d,
sum(split_total_amount) order_total_amount_1d,
dt
from dwd_trade_order_detail_inc
where dt <= '2020-06-14'
group by dt,user_id,sku_id
),sku as (
select id,
category3_id,
category3_name,
category2_id,
category2_name,
category1_id,
category1_name,
tm_id,
tm_name
from dim_sku_full
where dt = '2020-06-14'
)
----每個用戶下單的每個品牌的情況
insert overwrite table dws_trade_user_tm_order_1d partition (dt)
select `user_id`,
`sku_id`,
`tm_id`,
`tm_name`,
`category3_id`,
`category3_name`,
`category2_id`,
`category2_name`,
`category1_id`,
`category1_name`,
`order_num_1d`,
`order_total_amount_1d`,
`order_count_1d`
from order
left join sku
on order.sku_id = sku.id;
dws_trade_user_sku_order_nd
建表語句
DROP TABLE IF EXISTS dws_trade_user_sku_order_nd;
CREATE EXTERNAL TABLE dws_trade_user_sku_order_nd
(
`user_id` STRING COMMENT '用戶id',
`sku_id` STRING COMMENT '商品id',
`tm_id` STRING COMMENT '品牌编号',
`tm_name` STRING COMMENT '品牌名稱',
`category3_id` STRING COMMENT '三级分类id',
`category3_name` STRING COMMENT '三级分类名称',
`category2_id` STRING COMMENT '二级分类id',
`category2_name` STRING COMMENT '二级分类名称',
`category1_id` STRING COMMENT '一级分类id',
`category1_name` STRING COMMENT '一级分类名称',
`order_num_7d` STRING COMMENT '最近1日下单件数',
`order_total_amount_7d` STRING COMMENT '最近1日下单件数',
`order_count_7d` BIGINT COMMENT '最近1日下单次数',
`order_num_30d` STRING COMMENT '最近1日下单件数',
`order_total_amount_30d` STRING COMMENT '最近1日下单件数',
`order_count_30d` BIGINT COMMENT '最近1日下单次数'
) COMMENT '交易域用戶商品粒度订单最n日汇总表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/warehouse/gmall/dwd/dws_trade_user_sku_order_nd/'
TBLPROPERTIES ('orc.compress' = 'snappy');
數據裝載
insert overwrite table dws_trade_user_tm_order_nd partition (dt)
select user_id,
sku_id,
tm_id,
tm_name,
category3_id,
category3_name,
category2_id,
category2_name,
category1_id,
category1_name,
sum(`if`(dt >= date_sub('2020-06-14',6),order_num_1d,0)) order_num_7d,
sum(`if`(dt >= date_sub('2020-06-14',6),order_total_amount_1d,0)) order_total_amount_7d,
sum(`if`(dt >= date_sub('2020-06-14',6),order_count_1d,0)) order_total_amount_7d,
sum(order_num_1d),
sum(order_total_amount_1d),
(order_count_1d),
dt
from dws_trade_user_sku_order_1d
where dt <= '2020-06-14' and dt >= date_sub('2020-06-14',29)
group by dt,sku_id,user_id,tm_id,tm_name,category3_id,category3_name,category2_id,category2_name,category1_id,category1_name;