创建DWS层每日设备行为表
每日设备行为,主要按照设备id统计。
-- dws层-每日设备行为
drop table if exists dws_uv_detail_daycount;
create external table dws_uv_detail_daycount
(
`mid_id` string COMMENT '设备id',
`brand` string COMMENT '手机品牌',
`model` string COMMENT '手机型号',
`login_count` bigint COMMENT '活跃次数',
`page_stats` array<struct<page_id:string,page_count:bigint>> COMMENT '页面访问统计'
) COMMENT '每日设备行为表'
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_daycount'
tblproperties ("parquet.compression"="lzo");
数据加载
-- dws层-每日设备行为数据加载
-- with > 创建一个临时表,除开头临时表需要加with,后面都不需要
with
-- tmp_start > 获取活跃次数
tmp_start as
(
select
mid_id,
brand,
model,
count(*) login_count
from dwd_start_log
where dt='2020-06-14'
group by mid_id,brand,model
),
-- tmp_page > 页面访问统计 > array<struct<page_id:string,page_count:bigint>>
tmp_page as
(
select
mid_id,
brand,
model,
-- 用 named_struct 可以直接指定列名
collect_set(named_struct('page_id',page_id,'page_count',page_count)) page_stats
from
(
-- struct<page_id:string,page_count:bigint>
select
mid_id,
brand,
model,
page_id,
count(*) page_count
from dwd_page_log
where dt='2020-06-14'
group by mid_id,brand,model,page_id
)tmp
group by mid_id,brand,model
)
insert overwrite table dws_uv_detail_daycount partition(dt='2020-06-14')
select
-- full outer join > 可能导致一边没有,谁有就取谁 反正最后都是设备字段
nvl(tmp_start.mid_id,tmp_page.mid_id),
nvl(tmp_start.brand,tmp_page.brand),
nvl(tmp_start.model,tmp_page.model),
tmp_start.login_count,
tmp_page.page_stats
from tmp_start
full outer join tmp_page
on tmp_start.mid_id=tmp_page.mid_id
and tmp_start.brand=tmp_page.brand
and tmp_start.model=tmp_page.model;
创建DWT层设备主题宽表
-- dwt-设备主题宽表
drop table if exists dwt_uv_topic;
create external table dwt_uv_topic
(
`mid_id` string comment '设备id',
`brand` string comment '手机品牌',
`model` string comment '手机型号',
`login_date_first` string comment '首次活跃时间',
`login_date_last` string comment '末次活跃时间',
`login_day_count` bigint comment '当日活跃次数',
`login_count` bigint comment '累积活跃天数'
) COMMENT '设备主题宽表'
stored as parquet
location '/warehouse/gmall/dwt/dwt_uv_topic'
tblproperties ("parquet.compression"="lzo");
数据导入
-- dwt-数据导入-设备主题
insert overwrite table dwt_uv_topic
select
nvl(new.mid_id,old.mid_id),
nvl(new.model,old.model),
nvl(new.brand,old.brand),
-- 因为是full outer join > 首次活跃时间 > 旧数据 mid 为空,首次活跃就时今天的数据,不为空就是老的数据
if(old.mid_id is null,'2020-06-14',old.login_date_first),
-- 末次活跃时间 > 新数据不为空,则就是今天的数据,否则就是旧数据的末次
if(new.mid_id is not null,'2020-06-14',old.login_date_last),
-- 当日活跃次数
if(new.mid_id is not null, new.login_count,0),
-- 累积活跃天数
nvl(old.login_count,0)+if(new.login_count>0,1,0)
from
(
select
*
from dwt_uv_topic
)old
full outer join
(
select
*
from dws_uv_detail_daycount
where dt='2020-06-14'
)new
on old.mid_id=new.mid_id;