DWS層的構建過程

DWS層的構建過程

ADS層需求:

最近1730日各品牌訂單數
最近1730日各品牌订单人数	
最近1730日各品類訂單數
最近1730日各品類訂單人數	

分析過程:

在这里插入图片描述
將相同的統計周期,業務過程,統計粒度放入同一張dws表,需要2*2四張表
dws表的命名規範:dws_数据域_统计粒度_业务过程_统计周期(1d/nd/td)
這四張表分別是:

1.dws_trade_tm_order_1d
2.dws_trade_tm_order_nd
3.dws_trade_category_order_1d
4.dws_trade_category_order_nd

首先來看第一張表的構建過程:dws_trade_tm_order_1d

1.需要哪些字段:

行列分區:
行:由統計的粒度決定:tm_id,tm_name
列:根據派生指標決定(主要由dwd層的事實決定),order_count,order_user_count,order_total_amount

2.分區

按照日期分區,一天一個分區

3.建表語句

DROP TABLE IF EXISTS dws_trade_tm_order_1d;
CREATE EXTERNAL TABLE dws_trade_tm_order_1d
(
    `tm_id`              STRING COMMENT '品牌编号',
    `tm_name`            STRING COMMENT '品牌名稱',
    `order_count`        STRING COMMENT '下單數量',
    `order_user_count`   STRING COMMENT '下單人數',
    `order_total_amount` STRING COMMENT '下單金額'
) COMMENT '交易域下单明细事务事实表'
    PARTITIONED BY (`dt` STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dwd/dws_trade_tm_order_1d/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

4.數據裝載

分析:

  • 數據來源于dwd層的下單事實表

  • 下單事實表的構建過程:

    1.將和下單過程相關聯的業務表找出來,分別是: ods_order_info_inc,ods_order_detail_inc,ods_order_detail_activity_inc,ods_order_detail_coupon_inc
    2.主表是ods_order_detail_inc,將這張表和其他表進行left join
    3.建表语句

DROP TABLE IF EXISTS dwd_trade_order_detail_inc;
CREATE EXTERNAL TABLE dwd_trade_order_detail_inc
(
    `id`                    STRING COMMENT '编号',
    `order_id`              STRING COMMENT '订单id',
    `user_id`               STRING COMMENT '用户id',
    `sku_id`                STRING COMMENT '商品id',
    `province_id`           STRING COMMENT '省份id',
    `activity_id`           STRING COMMENT '参与活动规则id',
    `activity_rule_id`      STRING COMMENT '参与活动规则id',
    `coupon_id`             STRING COMMENT '使用优惠券id',
    `date_id`               STRING COMMENT '下单日期id',
    `create_time`           STRING COMMENT '下单时间',
    `source_id`             STRING COMMENT '来源编号',
    `source_type_code`      STRING COMMENT '来源类型编码',
    `source_type_name`      STRING COMMENT '来源类型名称',
    `sku_num`               BIGINT COMMENT '商品数量',
    `split_original_amount` DECIMAL(16, 2) COMMENT '原始价格',
    `split_activity_amount` DECIMAL(16, 2) COMMENT '活动优惠分摊',
    `split_coupon_amount`   DECIMAL(16, 2) COMMENT '优惠券优惠分摊',
    `split_total_amount`    DECIMAL(16, 2) COMMENT '最终价格分摊'
) COMMENT '交易域下单明细事务事实表'
    PARTITIONED BY (`dt` STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dwd/dwd_trade_order_detail_inc/'
    TBLPROPERTIES ('orc.compress' = 'snappy');
  1. 数据装载
(1)首日装载
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_trade_order_detail_inc partition (dt)
select
    od.id,
    order_id,
    user_id,
    sku_id,
    province_id,
    activity_id,
    activity_rule_id,
    coupon_id,
    date_format(create_time, 'yyyy-MM-dd') date_id,
    create_time,
    source_id,
    source_type,
    dic_name,
    sku_num,
    split_original_amount,
    nvl(split_activity_amount,0.0),
    nvl(split_coupon_amount,0.0),
    split_total_amount,
    date_format(create_time,'yyyy-MM-dd')
from
(
    select
        data.id,
        data.order_id,
        data.sku_id,
        data.create_time,
        data.source_id,
        data.source_type,
        data.sku_num,
        data.sku_num * data.order_price split_original_amount,
        data.split_total_amount,
        data.split_activity_amount,
        data.split_coupon_amount
    from ods_order_detail_inc
    where dt = '2020-06-14'
    and type = 'bootstrap-insert'
) od
left join
(
    select
        data.id,
        data.user_id,
        data.province_id
    from ods_order_info_inc
    where dt = '2020-06-14'
    and type = 'bootstrap-insert'
) oi
on od.order_id = oi.id
left join
(
    select
        data.order_detail_id,
        data.activity_id,
        data.activity_rule_id
    from ods_order_detail_activity_inc
    where dt = '2020-06-14'
    and type = 'bootstrap-insert'
) act
on od.id = act.order_detail_id
left join
(
    select
        data.order_detail_id,
        data.coupon_id
    from ods_order_detail_coupon_inc
    where dt = '2020-06-14'
    and type = 'bootstrap-insert'
) cou
on od.id = cou.order_detail_id
left join
(
    select
        dic_code,
        dic_name
    from ods_base_dic_full
    where dt='2020-06-14'
    and parent_code='24'
)dic
on od.source_type=dic.dic_code;
(2)每日装载
insert overwrite table dwd_trade_order_detail_inc partition (dt='2020-06-15')
select
    od.id,
    order_id,
    user_id,
    sku_id,
    province_id,
    activity_id,
    activity_rule_id,
    coupon_id,
    date_id,
    create_time,
    source_id,
    source_type,
    dic_name,
    sku_num,
    split_original_amount,
    nvl(split_activity_amount,0.0),
    nvl(split_coupon_amount,0.0),
    split_total_amount
from
(
    select
        data.id,
        data.order_id,
        data.sku_id,
        date_format(data.create_time, 'yyyy-MM-dd') date_id,
        data.create_time,
        data.source_id,
        data.source_type,
        data.sku_num,
        data.sku_num * data.order_price split_original_amount,
        data.split_total_amount,
        data.split_activity_amount,
        data.split_coupon_amount
    from ods_order_detail_inc
    where dt = '2020-06-15'
    and type = 'insert'
) od
left join
(
    select
        data.id,
        data.user_id,
        data.province_id
    from ods_order_info_inc
    where dt = '2020-06-15'
    and type = 'insert'
) oi
on od.order_id = oi.id
left join
(
    select
        data.order_detail_id,
        data.activity_id,
        data.activity_rule_id
    from ods_order_detail_activity_inc
    where dt = '2020-06-15'
    and type = 'insert'
) act
on od.id = act.order_detail_id
left join
(
    select
        data.order_detail_id,
        data.coupon_id
    from ods_order_detail_coupon_inc
    where dt = '2020-06-15'
    and type = 'insert'
) cou
on od.id = cou.order_detail_id
left join
(
    select
        dic_code,
        dic_name
    from ods_base_dic_full
    where dt='2020-06-15'
    and parent_code='24'
)dic
on od.source_type=dic.dic_code;

  • 可以看出訂單事實表中並沒有tm_id,tm_name,所以需要和商品維度表進行join
1)建表语句
DROP TABLE IF EXISTS dim_sku_full;
CREATE EXTERNAL TABLE dim_sku_full
(
    `id`                   STRING COMMENT 'sku_id',
    `price`                DECIMAL(16, 2) COMMENT '商品价格',
    `sku_name`             STRING COMMENT '商品名称',
    `sku_desc`             STRING COMMENT '商品描述',
    `weight`               DECIMAL(16, 2) COMMENT '重量',
    `is_sale`              BOOLEAN COMMENT '是否在售',
    `spu_id`               STRING COMMENT 'spu编号',
    `spu_name`             STRING COMMENT 'spu名称',
    `category3_id`         STRING COMMENT '三级分类id',
    `category3_name`       STRING COMMENT '三级分类名称',
    `category2_id`         STRING COMMENT '二级分类id',
    `category2_name`       STRING COMMENT '二级分类名称',
    `category1_id`         STRING COMMENT '一级分类id',
    `category1_name`       STRING COMMENT '一级分类名称',
    `tm_id`                STRING COMMENT '品牌id',
    `tm_name`              STRING COMMENT '品牌名称',
    `sku_attr_values`      ARRAY<STRUCT<attr_id :STRING,value_id :STRING,attr_name :STRING,value_name:STRING>> COMMENT '平台属性',
    `sku_sale_attr_values` ARRAY<STRUCT<sale_attr_id :STRING,sale_attr_value_id :STRING,sale_attr_name :STRING,sale_attr_value_name:STRING>> COMMENT '销售属性',
    `create_time`          STRING COMMENT '创建时间'
) COMMENT '商品维度表'
    PARTITIONED BY (`dt` STRING)
    STORED AS ORC
    LOCATION '/warehouse/gmall/dim/dim_sku_full/'
    TBLPROPERTIES ('orc.compress' = 'snappy');
2)数据装载
with
sku as
(
    select
        id,
        price,
        sku_name,
        sku_desc,
        weight,
        is_sale,
        spu_id,
        category3_id,
        tm_id,
        create_time
    from ods_sku_info_full
    where dt='2020-06-14'
),
spu as
(
    select
        id,
        spu_name
    from ods_spu_info_full
    where dt='2020-06-14'
),
c3 as
(
    select
        id,
        name,
        category2_id
    from ods_base_category3_full
    where dt='2020-06-14'
),
c2 as
(
    select
        id,
        name,
        category1_id
    from ods_base_category2_full
    where dt='2020-06-14'
),
c1 as
(
    select
        id,
        name
    from ods_base_category1_full
    where dt='2020-06-14'
),
tm as
(
    select
        id,
        tm_name
    from ods_base_trademark_full
    where dt='2020-06-14'
),
attr as
(
    select
        sku_id,
        collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
    from ods_sku_attr_value_full
    where dt='2020-06-14'
    group by sku_id
),
sale_attr as
(
    select
        sku_id,
        collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
    from ods_sku_sale_attr_value_full
    where dt='2020-06-14'
    group by sku_id
)
insert overwrite table dim_sku_full partition(dt='2020-06-14')
select
    sku.id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.is_sale,
    sku.spu_id,
    spu.spu_name,
    sku.category3_id,
    c3.name,
    c3.category2_id,
    c2.name,
    c2.category1_id,
    c1.name,
    sku.tm_id,
    tm.tm_name,
    attr.attrs,
    sale_attr.sale_attrs,
    sku.create_time
from sku
left join spu on sku.spu_id=spu.id
left join c3 on sku.category3_id=c3.id
left join c2 on c3.category2_id=c2.id
left join c1 on c2.category1_id=c1.id
left join tm on sku.tm_id=tm.id
left join attr on sku.id=attr.sku_id
left join sale_attr on sku.id=sale_attr.sku_id;

  • dws_trade_tm_order_1d 數據裝載過程
---數據裝載
-- 首日:存在歷史數據,也需要統計歷史每一天的最近1日的結果

with t1 as (
    select
        sku_id,
        user_id,
        sku_num,
        split_total_amount,
        dt
    from dwd_trade_order_detail_inc
    where dt <= '2020-06-14'
),t2 as (
    select id,
           tm_id,
           tm_name
    from dim_sku_full
    where dt = '2020-06-14'
)
insert overwrite table dws_trade_tm_order_1d partition (dt)
select
    tm_id,
    tm_name,
    count(1),
    count(distinct user_id),
    sum(split_total_amount),
    dt
from t1
left join t2
on t1.sku_id = t2.id
group by dt,tm_id,tm_name;

dws_trade_tm_order_nd

--dws_trade_tm_order_nd
--建表語句
DROP TABLE IF EXISTS dws_trade_tm_order_nd;
CREATE EXTERNAL TABLE dws_trade_tm_order_nd
(
    `tm_id`                  STRING COMMENT '品牌编号',
    `tm_name`                STRING COMMENT '品牌名稱',
    `order_count_7d`         STRING COMMENT '最近7日下單數量',
    `order_user_count_7d`    STRING COMMENT '最近7日下單人數',
    `order_total_amount_7d`  STRING COMMENT '最近7日下單金額',
    `order_count_30d`        STRING COMMENT '最近30日下單數量',
    `order_user_count_30d`   STRING COMMENT '最近30日下單人數',
    `order_total_amount_30d` STRING COMMENT '最近30日下單金額'
) COMMENT '交易域下单明细事务事实表'
    PARTITIONED BY (`dt` STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dwd/dws_trade_tm_order_nd/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

insert overwrite table dws_trade_tm_order_nd
select
    tm_id,
    tm_name,
    sum(`if`(dt>=date_sub('2020-06-14',6),order_count,0)) order_count_7d,
    sum(`if`(dt>=date_sub('2020-06-14',6),order_user_count,0)) order_user_count_7d,
    sum(`if`(dt>=date_sub('2020-06-14',6),order_total_amount,0)) order_total_amount_7d,
    sum(order_count) order_count_30d,
    sum(order_user_count) order_user_count_30d,
    sum(order_total_amount) order_total_amount_30d,
    dt
from dws_trade_tm_order_1d
where dt <= '2020-06-14' and dt >=date_sub('2020-06-14',29)
group by dt,tm_name,tm_id;

問題:

這樣的話,會有一個問題:統計的最近7天,最近30天的人數是直接相加的,這樣是錯誤的,同一個用戶可能連續下單了好幾天,那麽這樣統計得到的數據就是錯誤的

解決辦法:

將粒度進行進一步的細分:用戶-品牌粒度

V2版本

dws_trade_user_tm_order_1d

DROP TABLE IF EXISTS dws_trade_user_tm_order_1d;
CREATE EXTERNAL TABLE dws_trade_user_tm_order_1d
(

    `user_id`            STRING COMMENT '用戶id',
    `tm_id`              STRING COMMENT '品牌编号',
    `tm_name`            STRING COMMENT '品牌名稱',
    `order_count`        STRING COMMENT '下單數量',
    `order_total_amount` STRING COMMENT '下單金額'
) COMMENT '交易域用戶品牌粒度订单最近1日汇总表'
    PARTITIONED BY (`dt` STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dwd/dws_trade_user_tm_order_1d/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

with order as (
    select user_id,
           sku_id,
           sku_num,
           split_total_amount,
           dt
    from dwd_trade_order_detail_inc
    where dt <= '2020-06-14'
),sku as (
    select id,
           tm_id,
           tm_name
    from dim_sku_full
    where dt = '2020-06-14'
)

----每個用戶下單的每個品牌的情況
insert overwrite table dws_trade_user_tm_order_1d partition (dt)
select user_id,
       tm_id,
       tm_name,
       sum(sku_num),
       sum(split_total_amount),
       dt
from order
left join sku
on order.sku_id = sku.id
group by dt, user_id, tm_name, tm_id;

dws_trade_user_tm_order_nd

DROP TABLE IF EXISTS dws_trade_user_tm_order_nd;
CREATE EXTERNAL TABLE dws_trade_user_tm_order_nd
(
    `user_id`                STRING COMMENT '用戶id',
    `tm_id`                  STRING COMMENT '品牌编号',
    `tm_name`                STRING COMMENT '品牌名稱',
    `order_count_7d`         STRING COMMENT '最近7日下單數量',
    `order_total_amount_7d`  STRING COMMENT '最近7日下單金額',
    `order_count_30d`        STRING COMMENT '最近30日下單數量',
    `order_total_amount_30d` STRING COMMENT '最近30日下單金額'
) COMMENT '交易域用戶品牌粒度订单最近1日汇总表'
    PARTITIONED BY (`dt` STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dwd/dws_trade_user_tm_order_nd/'
    TBLPROPERTIES ('orc.compress' = 'snappy');


insert overwrite table dws_trade_user_tm_order_nd partition (dt)
select
       user_id,
       tm_id,
       tm_name,
       sum(if(dt>=date_sub('2020-06-14',6),order_count,0)) order_count_7d,
       sum(if(dt>=date_sub('2020-06-14',6),order_total_amount,0))  order_total_amount_7d,
       sum(order_count) order_count_30d,
       sum(order_total_amount) order_total_amount_30d,
       dt
from dws_trade_user_tm_order_1d
where dt <= '2020-06-14' and dt >= date_sub('2020-06-14',29)
group by dt,user_id,tm_id,tm_name;

最近1天個品牌的下單數和下單人數

select
       tm_id,
       tm_name,
       sum(order_count),
       count(distinct user_id),
       sum(order_total_amount),
       dt
from dws_trade_user_tm_order_1d
where dt <= '2020-06-14'
group by dt,tm_name,tm_id;

最近7天和30天各品牌的下單數和下單人數

select
       tm_id,
       tm_name,
       sum(order_count_7d),
       count(distinct(`if`(order_count_7d>0,user_id,null))),
       sum(order_total_amount_30d),
       count(distinct user_id),
       dt
from dws_trade_user_tm_order_nd
where dt <= '2020-06-14'
group by dt,tm_name,tm_id

在計算這個的時候會有點問題,主要是因爲在計算7天和30天的時候,可能7天内沒有用戶下單,但是7天之外,30天之内的範圍内有下單的用戶,所以需要使用條件查詢

  count(distinct(`if`(order_count_7d>0,user_id,null))),

V3版本

再進一步進行粒度的細分,用戶商品粒度
在这里插入图片描述

dws_trade_user_tm_order_1d

建表語句:


DROP TABLE IF EXISTS dws_trade_user_sku_order_1d;
CREATE EXTERNAL TABLE dws_trade_user_sku_order_1d
(

    `user_id`            STRING COMMENT '用戶id',
    `sku_id`             STRING COMMENT '商品id',
    `tm_id`              STRING COMMENT '品牌编号',
    `tm_name`            STRING COMMENT '品牌名稱',
    `category3_id`       STRING COMMENT '三级分类id',
    `category3_name`     STRING COMMENT '三级分类名称',
    `category2_id`       STRING COMMENT '二级分类id',
    `category2_name`     STRING COMMENT '二级分类名称',
    `category1_id`       STRING COMMENT '一级分类id',
    `category1_name`     STRING COMMENT '一级分类名称',
    `order_num_1d`        STRING COMMENT '最近1日下单件数',
    `order_total_amount_1d` STRING COMMENT '最近1日下单件数',
    `order_count_1d`            BIGINT COMMENT '最近1日下单次数'
) COMMENT '交易域用戶商品粒度订单最近1日汇总表'
    PARTITIONED BY (`dt` STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dwd/dws_trade_user_sku_order_1d/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

數據裝載

dws_1d表每个分区存放当日的统计结果
–数据加载
----首日: [存在历史数据,也需要统计历史每一天的最近1日的结果]

--一個人買同一個商品的金額,件數
with order as (
    select
           user_id,
            sku_id,
            count(1) order_count_1d,
            sum(sku_num) order_num_1d,
           sum(split_total_amount) order_total_amount_1d,
           dt
    from dwd_trade_order_detail_inc
    where dt <= '2020-06-14'
    group by dt,user_id,sku_id
),sku as (
    select id,
           category3_id,
           category3_name,
           category2_id,
           category2_name,
           category1_id,
           category1_name,
           tm_id,
           tm_name
    from dim_sku_full
    where dt = '2020-06-14'
)

----每個用戶下單的每個品牌的情況
insert overwrite table dws_trade_user_tm_order_1d partition (dt)
select `user_id`,
       `sku_id`,
       `tm_id`,
       `tm_name`,
       `category3_id`,
       `category3_name`,
       `category2_id`,
       `category2_name`,
       `category1_id`,
       `category1_name`,
       `order_num_1d`,
       `order_total_amount_1d`,
       `order_count_1d`
from order
left join sku
on order.sku_id = sku.id;

dws_trade_user_sku_order_nd

建表語句

DROP TABLE IF EXISTS dws_trade_user_sku_order_nd;
CREATE EXTERNAL TABLE dws_trade_user_sku_order_nd
(

    `user_id`                STRING COMMENT '用戶id',
    `sku_id`                 STRING COMMENT '商品id',
    `tm_id`                  STRING COMMENT '品牌编号',
    `tm_name`                STRING COMMENT '品牌名稱',
    `category3_id`           STRING COMMENT '三级分类id',
    `category3_name`         STRING COMMENT '三级分类名称',
    `category2_id`           STRING COMMENT '二级分类id',
    `category2_name`         STRING COMMENT '二级分类名称',
    `category1_id`           STRING COMMENT '一级分类id',
    `category1_name`         STRING COMMENT '一级分类名称',
    `order_num_7d`           STRING COMMENT '最近1日下单件数',
    `order_total_amount_7d`  STRING COMMENT '最近1日下单件数',
    `order_count_7d`         BIGINT COMMENT '最近1日下单次数',
    `order_num_30d`          STRING COMMENT '最近1日下单件数',
    `order_total_amount_30d` STRING COMMENT '最近1日下单件数',
    `order_count_30d`        BIGINT COMMENT '最近1日下单次数'
) COMMENT '交易域用戶商品粒度订单最n日汇总表'
    PARTITIONED BY (`dt` STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS ORC
    LOCATION '/warehouse/gmall/dwd/dws_trade_user_sku_order_nd/'
    TBLPROPERTIES ('orc.compress' = 'snappy');

數據裝載


insert overwrite table dws_trade_user_tm_order_nd partition (dt)
select user_id,
       sku_id,
       tm_id,
       tm_name,
       category3_id,
       category3_name,
       category2_id,
       category2_name,
       category1_id,
       category1_name,
       sum(`if`(dt >= date_sub('2020-06-14',6),order_num_1d,0)) order_num_7d,
       sum(`if`(dt >= date_sub('2020-06-14',6),order_total_amount_1d,0)) order_total_amount_7d,
       sum(`if`(dt >= date_sub('2020-06-14',6),order_count_1d,0)) order_total_amount_7d,
       sum(order_num_1d),
       sum(order_total_amount_1d),
       (order_count_1d),
       dt
from dws_trade_user_sku_order_1d
where dt <= '2020-06-14' and dt >= date_sub('2020-06-14',29)
group by dt,sku_id,user_id,tm_id,tm_name,category3_id,category3_name,category2_id,category2_name,category1_id,category1_name;

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

m0_37759590

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值