DWS层
dws层的数据从dwd层来选取,在建模上进行主题分类,分类后每个主题都制作一张表,按照dwd层的数据,进行每日的轻度聚合,一般是建宽表。
说明:本次DWS层的数据清洗是与DWT(主题宽表层)相对应的,区别在于DWS层是针对于每日进行统计的,而DWT层则是针对全部进行统计。
DWT层 | 每日设备行为 | 每日会员行为 | 每日商品行为 | 每日优惠券统计 (预留) | 每日活动统计(预留) | 每日购买行为 |
---|---|---|---|---|---|---|
DWT层 | 设备主题宽表 | 会员主题宽表 | 商品主题宽表 | 优惠券主题宽表 | 活动主题宽表 | 会员主题(ADS层) |
DWS层每日设备行为(用户行为)
drop table if exists dws_uv_detail_daycount;
create external table dws_uv_detail_daycount
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`login_count` bigint COMMENT '活跃次数'
)
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_daycount'
TBLPROPERTIES('parquet.compression'='lzo');
insert overwrite table dws_uv_detail_daycount partition(dt='2020-03-29')
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang))lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
count(*) login_count
from dwd_start_log
where dt='2020-03-29'
group by mid_id;
DWS层(业务)
每日用户行为
drop table if exists dws_user_action_daycount;
create external table dws_user_action_daycount
(
user_id string comment '用户 id',
login_count bigint comment '登录次数',
cart_count bigint comment '加入购物车次数',
cart_amount double comment '加入购物车金额',
order_count bigint comment '下单次数',
order_amount decimal(16,2) comment '下单金额',
payment_count bigint comment '支付次数',
payment_amount decimal(16,2) comment '支付金额'
) COMMENT '每日用户行为'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action_daycount/'
tblproperties ("parquet.compression"="lzo");
WITH tmp_login AS (
SELECT user_id, count(*) AS login_count
FROM dwd_start_log
WHERE dt = '2020-03-29'
AND user_id IS NOT NULL
GROUP BY user_id
),
tmp_cart AS (
SELECT user_id, count(*) AS cart_count
, sum(cart_price * sku_num) AS cart_amount
FROM dwd_fact_cart_info
WHERE dt = '2020-03-29'
AND user_id IS NOT NULL
AND date_format(create_time, 'yyyy-MM-dd') = '2020-03-29'
GROUP BY user_id
),
tmp_order AS (
SELECT user_id, count(*) AS order_count, sum(final_total_amount) AS order_amount
FROM dwd_fact_order_info
WHERE dt = '2020-03-29'
GROUP BY user_id
),
tmp_payment AS (
SELECT user_id, count(*) AS payment_count, sum(payment_amount) AS payment_amount
FROM dwd_fact_payment_info
WHERE dt = '2020-03-29'
GROUP BY user_id
)
INSERT OVERWRITE INTO dws_user_action_daycount PARTITION (dt='2020-03-29')
SELECT user_actions.user_id, sum(user_actions.login_count), sum(user_actions.cart_count)
, sum(user_actions.cart_amount), sum(user_actions.order_count)
, sum(user_actions.order_amount), sum(user_actions.payment_count)
, sum(user_actions.payment_amount)
FROM (
SELECT user_id, login_count, 0 AS cart_count, 0 AS cart_amount, 0 AS order_count
, 0 AS order_amount, 0 AS payment_count, 0 AS payment_amount
FROM tmp_login
UNION ALL
SELECT user_id, 0 AS login_count, cart_count, cart_amount, 0 AS order_count
, 0 AS order_amount, 0 AS payment_count, 0 AS payment_amount
FROM tmp_cart
UNION ALL
SELECT user_id, 0 AS login_count, 0 AS cart_count, 0 AS cart_amount, order_count
, order_amount, 0 AS payment_count, 0 AS payment_amount
FROM tmp_order
UNION ALL
SELECT user_id, 0 AS login_count, 0 AS cart_count, 0 AS cart_amount, 0 AS order_count
, 0 AS order_amount, payment_count, payment_amount
FROM tmp_payment
) user_actions
GROUP BY user_id;
每日商品行为
drop table if exists dws_sku_action_daycount;
create external table dws_sku_action_daycount
(
sku_id string comment 'sku_id',
order_count bigint comment '被下单次数',
order_num bigint comment '被下单件数',
order_amount decimal(16,2) comment '被下单金额',
payment_count bigint comment '被支付次数',
payment_num bigint comment '被支付件数',
payment_amount decimal(16,2) comment '被支付金额',
refund_count bigint comment '被退款次数',
refund_num bigint comment '被退款件数',
refund_amount decimal(16,2) comment '被退款金额',
cart_count bigint comment '被加入购物车次数',
cart_num bigint comment '被加入购物车件数',
favor_count bigint comment '被收藏次数',
appraise_good_count bigint comment '好评数',
appraise_mid_count bigint comment '中评数',
appraise_bad_count bigint comment '差评数',
appraise_default_count bigint comment '默认评价数'
) COMMENT '每日商品行为'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_sku_action_daycount/'
tblproperties ("parquet.compression"="lzo");
每日地区统计
drop table if exists dws_area_stats_daycount;
create external table dws_area_stats_daycount(
`id` bigint COMMENT '编号',
`province_name` string COMMENT '省份名称',
`area_code` string COMMENT '地区编码',
`iso_code` string COMMENT 'iso 编码',
`region_id` string COMMENT '地区 ID',
`region_name` string COMMENT '地区名称',
`login_count` string COMMENT '活跃设备数',
`order_count` bigint COMMENT '下单次数',
`order_amount` decimal(20,2) COMMENT '下单金额',
`payment_count` bigint COMMENT '支付次数',
`payment_amount` decimal(20,2) COMMENT '支付金额'
) COMMENT '每日地区统计表'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_area_stats_daycount/'
tblproperties ("parquet.compression"="lzo");
with
tmp_login as
(
select
area_code,
count(*) login_count
from dwd_start_log
where dt='2020-06-14'
group by area_code
),
tmp_op as
(
select
province_id,
sum(if(date_format(create_time,'yyyy-MM-dd')='2020-06-14',1,0))
order_count,
sum(if(date_format(create_time,'yyyy-MM-dd')='2020-06-14',final_total_amount,
0)) order_amount,
sum(if(date_format(payment_time,'yyyy-MM-dd')='2020-06-14',1,0))
payment_count,
sum(if(date_format(payment_time,'yyyy-MM-dd')='2020-06-14',final_total_amoun
t,0)) payment_amount
from dwd_fact_order_info
where (dt='2020-06-14' or dt=date_add('2020-06-14',-1))
group by province_id
)
insert overwrite table dws_area_stats_daycount partition(dt='2020-06-14')
select
pro.id,
pro.province_name,
pro.area_code,
pro.iso_code,
pro.region_id,
pro.region_name,
nvl(tmp_login.login_count,0),
nvl(tmp_op.order_count,0),
nvl(tmp_op.order_amount,0.0),
nvl(tmp_op.payment_count,0),
nvl(tmp_op.payment_amount,0.0)
from dwd_dim_base_province pro
left join tmp_login on pro.area_code=tmp_login.area_code
left join tmp_op on pro.id=tmp_op.province_id;
每日用户购买行为
drop table if exists dws_sale_detail_daycount;
create external table dws_sale_detail_daycount
(
user_id string comment '用户 id',
sku_id string comment '商品 id',
user_gender string comment '用户性别',
user_age string comment '用户年龄',
user_level string comment '用户等级',
order_price decimal(10,2) comment '商品价格',
sku_name string comment '商品名称',
sku_tm_id string comment '品牌id',
sku_category3_id string comment '商品三级品类id',
sku_category2_id string comment '商品二级品类id',
sku_category1_id string comment '商品一级品类id',
sku_category3_name string comment '商品三级品类名称',
sku_category2_name string comment '商品二级品类名称',
sku_category1_name string comment '商品一级品类名称',
spu_id string comment '商品 spu',
sku_num int comment '购买个数',
order_count bigint comment '当日下单单数',
order_amount decimal(16,2) comment '当日下单金额'
) COMMENT '每日购买行为'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_sale_detail_daycount/'
tblproperties ("parquet.compression"="lzo");
INSERT OVERWRITE INTO dws_sale_detail_daycount PARTITION (dt='2020-03-29')
SELECT op.user_id, op.sku_id, ui.gender
, months_between('2020-03-29', ui.birthday) / 12 AS age
, ui.user_level, si.price, si.sku_name, si.tm_id, si.category3_id
, si.category2_id, si.category1_id, si.category3_name, si.category2_name, si.category1_name
, si.spu_id, op.sku_num, op.order_count, op.order_amount
FROM (
SELECT user_id, sku_id, sum(sku_num) AS sku_num
, count(*) AS order_count, sum(total_amount) AS order_amount
FROM dwd_fact_order_detail
WHERE dt = '2020-03-29'
GROUP BY user_id, sku_id
) op
JOIN (
SELECT *
FROM dwd_dim_user_info_his
WHERE end_date = '9999-99-99'
) ui
ON op.user_id = ui.id
JOIN (
SELECT *
FROM dwd_dim_sku_info
WHERE dt = '2020-03-29'
) si
ON op.sku_id = si.id;
每日优惠券统计(预留)
drop table if exists dws_coupon_use_daycount;
create external table dws_coupon_use_daycount
(
`coupon_id` string COMMENT '优惠券ID',
`coupon_name` string COMMENT '购物券名称',
`coupon_type` string COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
`condition_amount` string COMMENT '满额数',
`condition_num` string COMMENT '满件数',
`activity_id` string COMMENT '活动编号',
`benefit_amount` string COMMENT '减金额',
`benefit_discount` string COMMENT '折扣',
`create_time` string COMMENT '创建时间',
`range_type` string COMMENT '范围类型 1、商品 2、品类 3、品牌',
`spu_id` string COMMENT '商品id',
`tm_id` string COMMENT '品牌id',
`category3_id` string COMMENT '品类id',
`limit_num` string COMMENT '最多领用次数',
`get_count` bigint COMMENT '领用次数',
`using_count` bigint COMMENT '使用(下单)次数',
`used_count` bigint COMMENT '使用(支付)次数'
) COMMENT '每日优惠券统计'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_coupon_use_daycount/'
tblproperties ("parquet.compression"="lzo");