三、数据仓库电商项目——DWD层

一、启动日志start_log清洗到DWD

用到的函数:get_json_object

1)person表xjson字段内容为:
Xjson=[{"name":"大郎","sex":"男","age":"25"},{"name":"西门庆","sex":"男","age":"47"}]

2)取出第一个json对象
SELECT get_json_object(xjson,"$.[0]") FROM person;
结果是:{"name":"大郎","sex":"男","age":"25"}

3)取出第一个json的age字段的值
SELECT get_json_object(xjson,"$.[0].age") FROM person;
结果是:25

启动日志表(start_log)

rop table if exists dwd_start_log;
CREATE EXTERNAL TABLE dwd_start_log(
`mid_id` string,
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string,
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string,  
`app_time` string,
`network` string, 
`lng` string, 
`lat` string, 
`entry` string, 
`open_ad_type` string, 
`action` string, 
`loading_time` string, 
`detail` string, 
`extend1` string
)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_start_log/'
TBLPROPERTIES('parquet.compression'='lzo'); 
#!/bin/bash

# 定义变量方便修改
APP=gmall

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
	do_date=$1
else 
	do_date=`date -d "-1 day" +%F`  
fi 

sql="
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table "$APP".dwd_start_log
PARTITION (dt='$do_date')
select 
    get_json_object(line,'$.mid') mid_id,
    get_json_object(line,'$.uid') user_id,
    get_json_object(line,'$.vc') version_code,
    get_json_object(line,'$.vn') version_name,
    get_json_object(line,'$.l') lang,
    get_json_object(line,'$.sr') source,
    get_json_object(line,'$.os') os,
    get_json_object(line,'$.ar') area,
    get_json_object(line,'$.md') model,
    get_json_object(line,'$.ba') brand,
    get_json_object(line,'$.sv') sdk_version,
    get_json_object(line,'$.g') gmail,
    get_json_object(line,'$.hw') height_width,
    get_json_object(line,'$.t') app_time,
    get_json_object(line,'$.nw') network,
    get_json_object(line,'$.ln') lng,
    get_json_object(line,'$.la') lat,
    get_json_object(line,'$.entry') entry,
    get_json_object(line,'$.open_ad_type') open_ad_type,
    get_json_object(line,'$.action') action,
    get_json_object(line,'$.loading_time') loading_time,
    get_json_object(line,'$.detail') detail,
    get_json_object(line,'$.extend1') extend1
from "$APP".ods_start_log 
where dt='$do_date';
"

$hive -e "$sql"

事件表(event_log)

事件日志中需要编写UDF,UDTF进行过滤

ods_event_log中拿出一条数据,事件中包含事件

1605927018899|{"cm":
{"ln":"-39.8","sv":"V2.9.8",
	"os":"8.2.2","g":"RR02OCPX@gmail.com",
	"mid":"0","nw":"3G","l":"es","vc":"8",
	"hw":"640*1136","ar":"MX","uid":"0",
	"t":"1605921511845","la":"28.5",
	"md":"Huawei-8","vn":"1.2.7",
	"ba":"Huawei","sr":"F"},
	ap":"app",

"et":[{"ett":"1605904328376","en":"newsdetail",
	"kv":{"entry":"1",
	"goodsid":"0",
	"news_staytime":"18",
	"loading_time":"4",
	"action":"4",
	"showtype":"3",
	"category":"27",
	"type1":"433"}},
	
	{"ett":"1605850106419","en":"notification",
	
	"kv":{"ap_time":"1605914557494",
		"action":"3",
		"type":"4",
		"content":""}},
	
	{"ett":"1605905731874",
		"en":"active_background",
		
		"kv":{"active_source":"1"}}]}	
drop table if exists dwd_base_event_log;
CREATE EXTERNAL TABLE dwd_base_event_log(
`mid_id` string,
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string,
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string, 
`network` string, 
`lng` string, 
`lat` string, 
`event_name` string, 
`event_json` string, 
`server_time` string)
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_base_event_log/'
TBLPROPERTIES('parquet.compression'='lzo');

写脚本导入数据ods_to_dwd_base_log.sh

#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
	do_date=$1
else 
	do_date=`date -d "-1 day" +%F`  
fi 

sql="
use gmall;
insert overwrite table "$APP".dwd_base_event_log partition(dt='$do_date')
select
    base_analizer(line,'mid') as mid_id,
    base_analizer(line,'uid') as user_id,
    base_analizer(line,'vc') as version_code,
    base_analizer(line,'vn') as version_name,
    base_analizer(line,'l') as lang,
    base_analizer(line,'sr') as source,
    base_analizer(line,'os') as os,
    base_analizer(line,'ar') as area,
    base_analizer(line,'md') as model,
    base_analizer(line,'ba') as brand,
    base_analizer(line,'sv') as sdk_version,
    base_analizer(line,'g') as gmail,
    base_analizer(line,'hw') as height_width,
    base_analizer(line,'t') as app_time,
    base_analizer(line,'nw') as network,
    base_analizer(line,'ln') as lng,
    base_analizer(line,'la') as lat,
    event_name,
    event_json,
    base_analizer(line,'st') as server_time
from "$APP".ods_event_log lateral view flat_analizer(base_analizer(line,'et')) tem_flat as event_name,event_json
where dt='$do_date'  and base_analizer(line,'et')<>'';
"
#cdh
# sudu -u hive hive -e "$sql"
$hive -e "$sql"

DWD层用户行为事件表获取

说明:因为事件表已经拆分开,所以这里需要建设10张表来进行存储, (数据来源 dwd_base_event_log)

商品点击表 dwd_display_log 

在这里插入图片描述

商品详情页表 

drop table if exists dwd_newsdetail_log;
CREATE EXTERNAL TABLE dwd_newsdetail_log(
`mid_id` string,
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string,
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string, 
`app_time` string,  
`network` string, 
`lng` string, 
`lat` string, 
`entry` string,
`action` string,
`goodsid` string,
`showtype` string,
`news_staytime` string,
`loading_time` string,
`type1` string,
`category` string,
`server_time` string)
PARTITIONED BY (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_newsdetail_log/'
TBLPROPERTIES('parquet.compression'='lzo');
hive (gmall)> 
insert overwrite table dwd_newsdetail_log PARTITION (dt='2020-03-29')
select 
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
get_json_object(event_json,'$.kv.entry') entry,
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.goodsid') goodsid,
get_json_object(event_json,'$.kv.showtype') showtype,
get_json_object(event_json,'$.kv.news_staytime') news_staytime,
get_json_object(event_json,'$.kv.loading_time') loading_time,
get_json_object(event_json,'$.kv.type1') type1,
get_json_object(event_json,'$.kv.category') category,
server_time
from dwd_base_event_log
where dt='2020-03-29' and event_name='newsdetail';

点赞表 

评论表 

收藏表

消息通知表 

用户后台活跃表

二、数据仓库DWD层——业务数据

本层主要表类型:

  • 存储各业务表的全量快照
  • 存储各业务表的拉链表

        每一张维度表基本都是从ods层中筛选字段进行导入,有些导入不是很复杂,但是有些导入会涉及多个join,多个join也意味着资源高、效率低的缺点,这一块可以多了解优化,面试必备技巧。下面就要进行业务层DWD层的建模了

商品维度表(​​​​​​​全量数据快照)

活动维度表(全量数据快照)

在这里插入图片描述
可以看到红色字段通过left join 进行筛选

地区维度表(全量数据快照)

 用户维度表(拉链表)

用户表中的数据每日既有可能新增,也有可能修改,但修改频率并不高,属于缓慢变化维度,此处采用拉链表存储用户维度数据。 

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

INSERT OVERWRITE INTO dwd_dim_user_info_his_tmp
SELECT *
FROM (
	SELECT id, name, birthday, gender, email
		, user_level, create_time, operate_time, '2020-03-29' AS start_date, '9999-99-99' AS end_date
	FROM ods_user_info
	WHERE dt = '2020-03-29'
	UNION ALL
	SELECT uh.id, uh.name, uh.birthday, uh.gender, uh.email
		, uh.user_level, uh.create_time, uh.operate_time, uh.start_date
		, if(ui.id IS NOT NULL
			AND uh.end_date = '9999-99-99', date_add(ui.dt, -1), uh.end_date) AS end_date
	FROM dwd_dim_user_info_his uh
		LEFT JOIN (
			SELECT *
			FROM ods_user_info
			WHERE dt = '2020-03-29'
		) ui
		ON uh.id = ui.id
) his
ORDER BY his.id, start_date;

INSERT OVERWRITE INTO dwd_dim_user_info_his
SELECT *
FROM dwd_dim_user_info_his_tmp;

订单明细事实表(事务型快照事实表) 

在这里插入图片描述

drop table if exists dwd_fact_order_detail;
create external table dwd_fact_order_detail (
    `id` string COMMENT '订单编号',
    `order_id` string COMMENT '订单号',
    `user_id` string COMMENT '用户id',
    `sku_id` string COMMENT 'sku商品id',
    `sku_name` string COMMENT '商品名称',
    `order_price` decimal(10,2) COMMENT '商品价格',
    `sku_num` bigint COMMENT '商品数量',
    `create_time` string COMMENT '创建时间',
    `province_id` string COMMENT '省份ID',
    `total_amount` decimal(20,2) COMMENT '商品总金额'
) 
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_fact_order_detail/'
tblproperties ("parquet.compression"="lzo"); 
insert overwrite table dwd_fact_order_detail partition(dt='2020-03-29')
select
    od.id,
    od.order_id,
    od.user_id,
    od.sku_id,
    od.sku_name,
    od.order_price,
    od.sku_num,
    od.create_time,
    oi.province_id,
    od.order_price*od.sku_num   
from 
(
    select * from ods_order_detail where dt='2020-03-29'
) od
join 
(
    select * from ods_order_info where dt='2020-03-29'
) oi
on od.order_id=oi.id; 

支付事实表 (事务型快照事实表)

在这里插入图片描述
在这里插入图片描述

退款事实表(事务型快照事实表) 

把ODS层ods_order_refund_info表数据导入到DWD层退款事实表,在导入过程中可以做适当的清洗。
在这里插入图片描述

drop table if exists dwd_fact_order_refund_info;
create external table dwd_fact_order_refund_info(
    `id` string COMMENT '编号',
    `user_id` string COMMENT '用户ID',
    `order_id` string COMMENT '订单ID',
    `sku_id` string COMMENT '商品ID',
    `refund_type` string COMMENT '退款类型',
    `refund_num` bigint COMMENT '退款件数',
    `refund_amount` decimal(16,2) COMMENT '退款金额',
    `refund_reason_type` string COMMENT '退款原因类型',
    `create_time` string COMMENT '退款时间'
) COMMENT '退款事实表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
stored as parquet
location '/warehouse/gmall/dwd/dwd_fact_order_refund_info/'
tblproperties ("parquet.compression"="lzo");


insert overwrite table dwd_fact_order_refund_info partition(dt='2020-03-29')
select
    id,
    user_id,
    order_id,
    sku_id,
    refund_type,
    refund_num,
    refund_amount,
    refund_reason_type,
    create_time
from ods_order_refund_info
where dt='2020-03-29';

订单事实表(累积型快照事实表,动态分区)

设计

在这里插入图片描述
黄色为历史分区,以创建日期分区存放
绿色为增量数据,根据创建时间可以划分出来的分区
蓝色为合并后的分区
从黄色部分取和绿色划分相同的分区进行数据合并,再动态写回去

insert overwrite table bule partition(dt)
select coalesce(green.columns,yellow.columns),coalesce(green.dt,yellow.dt)
from yellow outer full join
green on  yellow.key = green.key and green.dt = '${dt}'
where yellow.dt in(
    select distinct substring(replace(create_time,'-'),1,7)             –## '20201001'
    from green where dt = '${dt}'
) 

在这里插入图片描述
订单生命周期:创建时间=>支付时间=>取消时间=>完成时间=>退款时间=>退款完成时间。

由于ODS层订单表只有创建时间和操作时间两个状态,不能表达所有时间含义,所以需要关联订单状态表。订单事实表里面增加了活动id,所以需要关联活动订单表。

drop table if exists dwd_fact_order_info;
create external table dwd_fact_order_info (
    `id` string COMMENT '订单编号',
    `order_status` string COMMENT '订单状态',
    `user_id` string COMMENT '用户id',
    `out_trade_no` string COMMENT '支付流水号',
    `create_time` string COMMENT '创建时间(未支付状态)',
    `payment_time` string COMMENT '支付时间(已支付状态)',
    `cancel_time` string COMMENT '取消时间(已取消状态)',
    `finish_time` string COMMENT '完成时间(已完成状态)',
    `refund_time` string COMMENT '退款时间(退款中状态)',
    `refund_finish_time` string COMMENT '退款完成时间(退款完成状态)',
    `province_id` string COMMENT '省份ID',
    `activity_id` string COMMENT '活动ID',
    `original_total_amount` string COMMENT '原价金额',
    `benefit_reduce_amount` string COMMENT '优惠金额',
    `feight_fee` string COMMENT '运费',
    `final_total_amount` decimal(10,2) COMMENT '订单金额'
) 
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_fact_order_info/'
tblproperties ("parquet.compression"="lzo");

在这里插入图片描述

set hive.exec.dynamic.partition.mode=nonstrict; 
insert overwrite table dwd_fact_order_info partition(dt) 
select 
	if(new.id is null,old.id,new.id), 
	if(new.order_status is null,old.order_status,new.order_status), 
	if(new.user_id is null,old.user_id,new.user_id), 
	if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no), 
	if(new.tms['1001'] is null,old.create_time,new.tms['1001']),
	if(new.tms['1002'] is null,old.payment_time,new.tms['1002']), 
	if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']), 
	if(new.tms['1004'] is null,old.finish_time,new.tms['1004']), 
	if(new.tms['1005'] is null,old.refund_time,new.tms['1005']), 
	if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']), 
	if(new.province_id is null,old.province_id,new.province_id), 
	if(new.activity_id is null,old.activity_id,new.activity_id), 
	if(new.original_total_amount is null,old.original_total_amount,new.original_total_amount), 
	if(new.benefit_reduce_amount is null,old.benefit_reduce_amount,new.benefit_reduce_amount), 
	if(new.feight_fee is null,old.feight_fee,new.feight_fee), 
	if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount), 
	date_format(if(new.tms['1001'] is null,old.create_time,new.tms['1001']),'yyyy-MM-dd') 
from ( 
	select 
		id,
		order_status, 
		user_id, 
		out_trade_no, 
		create_time, 
		payment_time, 
		cancel_time, 
		finish_time, 
		refund_time, 
		refund_finish_time, 
		province_id, 
		activity_id, 
		original_total_amount, 
		benefit_reduce_amount, 
		feight_fee, 
		final_total_amount 
	from dwd_fact_order_info 
	where dt 
	in 
	( 
		select 
			date_format(create_time,'yyyy-MM-dd') 
		from ods_order_info 
		where dt='2020-03-10' 
	) 
)old 
full outer join 
( 
	select 
		info.id, 
		info.order_status, 
		info.user_id, 
		info.out_trade_no, 
		info.province_id, 
		act.activity_id, 
		log.tms, 
		info.original_total_amount, 
		info.benefit_reduce_amount, 
		info.feight_fee, 
		info.final_total_amount 
	from 
	( 
		select 
			order_id, 
			str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=') tms 
		from ods_order_status_log 
		where dt='2020-03-10' 
		group by order_id 
	)log 
	join ( 
		select * from ods_order_info where dt='2020-03-10' 
	)info 
	on log.order_id=info.id 
	left join 
	( 
		select * from ods_activity_order where dt='2020-03-10' 
	)act 
	on log.order_id=act.order_id 
)new 
on old.id=new.id;
insert overwrite table dwd_order_info partition(dt)
select
    nvl(new.id,old.id),
    nvl(new.order_status,old.order_status),
    nvl(new.user_id,old.user_id),
    nvl(new.province_id,old.province_id),
    nvl(new.payment_way,old.payment_way),
    nvl(new.delivery_address,old.delivery_address),
    nvl(new.out_trade_no,old.out_trade_no),
    nvl(new.tracking_no,old.tracking_no),
    nvl(new.create_time,old.create_time),
    nvl(new.payment_time,old.payment_time),
    nvl(new.cancel_time,old.cancel_time),
    nvl(new.finish_time,old.finish_time),
    nvl(new.refund_time,old.refund_time),
    nvl(new.refund_finish_time,old.refund_finish_time),
    nvl(new.expire_time,old.expire_time),
    nvl(new.feight_fee,old.feight_fee),
    nvl(new.feight_fee_reduce,old.feight_fee_reduce),
    nvl(new.activity_reduce_amount,old.activity_reduce_amount),
    nvl(new.coupon_reduce_amount,old.coupon_reduce_amount),
    nvl(new.original_amount,old.original_amount),
    nvl(new.final_amount,old.final_amount),
    case
        when new.cancel_time is not null then date_format(new.cancel_time,'yyyy-MM-dd')
        when new.finish_time is not null and date_add(date_format(new.finish_time,'yyyy-MM-dd'),7)='2020-06-15' and new.refund_time is null then '2020-06-15'
        when new.refund_finish_time is not null then date_format(new.refund_finish_time,'yyyy-MM-dd')
        when new.expire_time is not null then date_format(new.expire_time,'yyyy-MM-dd')
        else '9999-99-99'
    end
    --COALESCE(new.cancel_time,new.finish_time,new.refund_finish_time,new.expire_time,'9999-99-99') 
from
(
    select
        id,
        order_status,
        user_id,
        province_id,
        payment_way,
        delivery_address,
        out_trade_no,
        tracking_no,
        create_time,
        payment_time,
        cancel_time,
        finish_time,
        refund_time,
        refund_finish_time,
        expire_time,
        feight_fee,
        feight_fee_reduce,
        activity_reduce_amount,
        coupon_reduce_amount,
        original_amount,
        final_amount
    from dwd_order_info
    where dt='9999-99-99'
)old
full outer join
(
    select
        oi.id,
        oi.order_status,
        oi.user_id,
        oi.province_id,
        oi.payment_way,
        oi.delivery_address,
        oi.out_trade_no,
        oi.tracking_no,
        oi.create_time,
        times.ts['1002'] payment_time,
        times.ts['1003'] cancel_time,
        times.ts['1004'] finish_time,
        times.ts['1005'] refund_time,
        times.ts['1006'] refund_finish_time,
        oi.expire_time,
        feight_fee,
        feight_fee_reduce,
        activity_reduce_amount,
        coupon_reduce_amount,
        original_amount,
        final_amount
    from
    (
        select
            *
        from ods_order_info
        where dt='2020-06-15'
    )oi
    left join
    (
        select
            order_id,
            str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=') ts
        from ods_order_status_log
        where dt='2020-06-15'
        group by order_id
    )times
    on oi.id=times.order_id
)new
on old.id=new.id;

 常用函数

select concat_ws('-','a','b');
a-b

str_to_map('1001=2020-03-29,1002=2020-03-29',  ','  ,  '=')
输出
{"1001":"2020-03-29","1002":"2020-03-29"}

hive (gmall)> select order_id, concat(order_status,'=', operate_time) from ods_order_status_log where dt='2020-03-29'; 

3210    1001=2020-03-29 00:00:00.0
3211    1001=2020-03-29 00:00:00.0 

hive (gmall)> select order_id, collect_set(concat(order_status,'=',operate_time)) from ods_order_status_log where dt='2020-03-29' group by order_id;

3210    ["1001=2020-03-29 00:00:00.0","1002=2020-03-29 00:00:00.0","1005=2020-03-29 00:00:00.0"]
3211    ["1001=2020-03-29 00:00:00.0","1002=2020-03-29 00:00:00.0","1004=2020-03-29  

hive (gmall)> 
select order_id, concat_ws(',', collect_set(concat(order_status,'=',operate_time))) from ods_order_status_log where dt='2020-03-29' group by order_id;

3210    1001=2020-03-29 00:00:00.0,1002=2020-03-29 00:00:00.0,1005=2020-03-29 00:00:00.0
3211    1001=2020-03-29 00:00:00.0,1002=2020-03-29 00:00:00.0,1004=2020-03-29 00:00:00.0
 
hive (gmall)>
select order_id, str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))), ','  ,  '=') from ods_order_status_log where dt='2020-03-29' group by order_id;

3210    {"1001":"2020-03-29 00:00:00.0","1002":"2020-03-29 00:00:00.0","1005":"2020-03-29 00:00:00.0"}
3211    {"1001":"2020-03-29 00:00:00.0","1002":"2020-03-29 00:00:00.0","1004":"2020-03-29 00:00:00.0"}
 

  • 2
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

四月天03

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值