I. DWS Layer Development
The modeling philosophy of the DWS layer is to support the computation of final requirements, so its modeling is relatively flexible.
Common modeling approaches:
1. Dimension integration (building wide tables):
In a fact table, join each dimension id against its dimension table and replace it with the actual dimension values; fact tables of several different subjects may also be joined together.
2. Light aggregation by subject:
Lightly aggregate the detail data by a specific subject, so that the many related statistical reports downstream can reuse the result.
For example, to serve various traffic statistics reports, design a traffic session aggregation table and a traffic user aggregation table.
3. Subject partitioning:
For example, our behavior log contains dozens of event types. For certain key events, analysts need to run focused, in-depth analyses that do not involve any other events; such events can be extracted into dedicated tables (subject tables).
For example, from our behavior event table we can extract:
a traffic event table
a favorites event table
an ad event table
Once events are split into subject tables, each event subject table can flatten its properties field into structured columns, which makes all later analyses of that event subject much more convenient.
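As an illustration, here is a minimal sketch of such an extraction; the target table dws17.app_evt_favorite, the event id 'favorite', and the properties keys 'pageid'/'itemid' are assumed names, not taken from the project:
-- assumes dws17.app_evt_favorite was created beforehand with matching columns
INSERT INTO TABLE dws17.app_evt_favorite PARTITION(dt='2020-10-07')
SELECT
guid ,
sessionid ,
ts ,
properties['pageid'] as page_id , -- flattened out of the generic properties map
properties['itemid'] as item_id
FROM dwd17.app_action_detail
WHERE dt='2020-10-07' AND eventid='favorite';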
pv: page view, i.e., the number of page views;
uv: unique visitor, i.e., the number of distinct visitors.
II. ADS Layer Development
1. Traffic session aggregation table
Table model:
guid, sessionid, start time, end time, entry page, exit page, page-view count, new/returning flag, province, city, region, device model, ...
-- Source table: dwd17.app_action_detail
-- Target table: dws17.app_trf_agr_session
-- Create the target table model
CREATE DATABASE dws17;
CREATE TABLE dws17.app_trf_agr_session(
guid string ,
session_id string ,
start_ts bigint ,
end_ts bigint ,
first_page_id string ,
last_page_id string ,
pv_cnt int ,
isnew int ,
hour_itv int ,
country string ,
province string ,
city string ,
region string ,
device_type string
)
PARTITIONED BY (dt string)
STORED AS PARQUET
;
Computation logic:
Aggregate all the events of one session of one user into a single record.
Once this DWS table exists, it can support the development of a great many reports.
WITH tmp1 AS
(
SELECT
guid ,
sessionid as session_id ,
min(ts) as start_ts ,
hour(from_unixtime(cast(min(ts)/1000 as bigint),'yyyy-MM-dd HH:mm:ss')) as hour_itv ,
max(ts) as end_ts ,
min(isnew) as isnew ,
count(if(eventid='pageView',1,null)) as pv_cnt
FROM dwd17.app_action_detail
WHERE dt='2020-10-07'
GROUP BY guid,sessionid
)
,tmp2 AS
(
SELECT
guid as guid ,
session_id as session_id ,
min(first_page_id) as first_page_id ,
min(last_page_id) as last_page_id ,
min(country) as country ,
min(province) as province ,
min(city) as city ,
min(region) as region ,
min(device_type) as device_type
FROM
(
SELECT
guid ,
sessionid as session_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts asc) as first_page_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts desc) as last_page_id ,
first_value(country) over(partition by guid,sessionid order by ts asc) as country ,
first_value(province) over(partition by guid,sessionid order by ts asc) as province ,
first_value(city) over(partition by guid,sessionid order by ts asc) as city ,
first_value(region) over(partition by guid,sessionid order by ts asc) as region ,
first_value(devicetype) over(partition by guid,sessionid order by ts asc) as device_type
FROM dwd17.app_action_detail
WHERE dt='2020-10-07' AND eventid='pageView'
) o
GROUP BY guid,session_id
)
INSERT INTO TABLE dws17.app_trf_agr_session PARTITION(dt='2020-10-07')
SELECT
tmp1.guid ,
tmp1.session_id ,
tmp1.start_ts ,
tmp1.end_ts ,
tmp2.first_page_id ,
tmp2.last_page_id ,
tmp1.pv_cnt ,
tmp1.isnew ,
tmp1.hour_itv ,
tmp2.country ,
tmp2.province ,
tmp2.city ,
tmp2.region ,
tmp2.device_type
FROM
tmp1
join
tmp2
on tmp1.guid=tmp2.guid and tmp1.session_id=tmp2.session_id
;
To query the data, write two subqueries and alias them tmp1 and tmp2, then join the two for the final select.
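For example, an hourly traffic report with bounce rate can be read straight off this table; a minimal sketch (treating sessions with at most one page view as bounces, which is an assumption, not a spec from the project):
SELECT
hour_itv ,
count(1) as session_cnt ,
round(count(if(pv_cnt<=1,1,null))/count(1),4) as bounce_rate
FROM dws17.app_trf_agr_session
WHERE dt='2020-10-07'
GROUP BY hour_itv;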
Script deployment:
#!/bin/bash
########################################################
# #
# @author hunter@doitedu #
# @date 2020-10-13 #
# @desc launcher script for the dwd -> dws traffic session agg job  #
# #
########################################################
export HIVE_HOME=/opt/apps/hive-3.1.2/
DT=`date -d'-1 day' +%Y-%m-%d`
if [ -n "$1" ]
then
DT="$1"
fi
${HIVE_HOME}/bin/hive -e "
WITH tmp1 AS
(
SELECT
guid ,
sessionid as session_id ,
min(ts) as start_ts ,
hour(from_unixtime(cast(min(ts)/1000 as bigint),'yyyy-MM-dd HH:mm:ss')) as hour_itv ,
max(ts) as end_ts ,
min(isnew) as isnew ,
count(if(eventid='pageView',1,null)) as pv_cnt
FROM dwd17.app_action_detail
WHERE dt='${DT}'
GROUP BY guid,sessionid
)
,tmp2 AS
(
SELECT
guid as guid ,
session_id as session_id ,
min(first_page_id) as first_page_id ,
min(last_page_id) as last_page_id ,
min(country) as country ,
min(province) as province ,
min(city) as city ,
min(region) as region ,
min(device_type) as device_type
FROM
(
SELECT
guid ,
sessionid as session_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts asc) as first_page_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts desc) as last_page_id ,
first_value(country) over(partition by guid,sessionid order by ts asc) as country ,
first_value(province) over(partition by guid,sessionid order by ts asc) as province ,
first_value(city) over(partition by guid,sessionid order by ts asc) as city ,
first_value(region) over(partition by guid,sessionid order by ts asc) as region ,
first_value(devicetype) over(partition by guid,sessionid order by ts asc) as device_type
FROM dwd17.app_action_detail
WHERE dt='${DT}' AND eventid='pageView'
) o
GROUP BY guid,session_id
)
INSERT INTO TABLE dws17.app_trf_agr_session PARTITION(dt='${DT}')
SELECT
tmp1.guid ,
tmp1.session_id ,
tmp1.start_ts ,
tmp1.end_ts ,
tmp2.first_page_id ,
tmp2.last_page_id ,
tmp1.pv_cnt ,
tmp1.isnew ,
tmp1.hour_itv ,
tmp2.country ,
tmp2.province ,
tmp2.city ,
tmp2.region ,
tmp2.device_type
FROM
tmp1
join
tmp2
on tmp1.guid=tmp2.guid and tmp1.session_id=tmp2.session_id
"
if [ $? -eq 0 ]
then
echo "congratulations! 任务执行成功! 邮件已发送至admin@51doit.com"
else
echo "节哀顺变! 任务失败! 邮件已发送至admin@51doit.com"
fi
2. Session count analysis
Step 1: create the table
CREATE TABLE dws17.app_trf_session_statistic(
province string,
device_type string,
session_cnt int,
session_avg double
)
PARTITIONED BY (dt string)
STORED AS PARQUET
;
Step 2: computation logic
insert into table dws17.app_trf_session_statistic partition (dt='2020-10-07')
select
province,
device_type,
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,device_type;
3. Returning-visitor analysis
Step 1: create the table
create table dws17.app_trf_reback_statistic(
province string,
city string,
device_type string,
uv_cnt int,
reback_cnt int
)
PARTITIONED BY (dt string)
stored as parquet ;
Step 2: computation logic
insert into table dws17.app_trf_reback_statistic partition (dt='2020-10-07')
select
province,
city,
device_type,
count(1) as uv_cnt,
count(if(session_cnt>1,1,null)) as reback_cnt
from
(select
province,
city,
device_type,
guid,
count(1) as session_cnt
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,device_type,guid
) o
group by province,city,device_type;
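A follow-up sketch of how this table would be consumed (the ratio column name is made up here):
select
province,
city,
device_type,
round(reback_cnt/uv_cnt,4) as reback_ratio
from dws17.app_trf_reback_statistic
where dt='2020-10-07';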
4. Multidimensional data cube
In the visitor analysis above, besides the province/city breakdown, statistics may also be needed by province alone, by device model, and so on; all the combinations that might be requested can therefore be computed in one pass with Hive's high-order grouping syntax:
with cube: aggregates over every combination of the selected columns
Step 1: create the table (the DDL for dws17.app_trf_session_cube is shown under the grouping sets section below)
Step 2: computation logic
insert into table dws17.app_trf_session_cube partition (dt='2020-10-07')
select
province,city,region,device_type,isnew,
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,region,device_type,isnew
with cube;
Step 3: query
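In a CUBE result, any column that is absent from a given combination comes back NULL, so a particular combination is selected by filtering on NULLs; a sketch for province-level totals (production code would usually also persist GROUPING__ID when writing the cube, so that subtotal rows are not confused with genuinely NULL source values):
select province, session_cnt, session_avg
from dws17.app_trf_session_cube
where dt='2020-10-07'
and province is not null and city is null and region is null
and device_type is null and isnew is null;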
grouping sets: you list, after the selected columns, exactly the combinations you want to appear
Step 1: create the table
-- do all the multidimensional analysis of session counts and average sessions per user in one pass
create table dws17.app_trf_session_cube(
province string,
city string,
region string,
device_type string,
isnew int,
session_cnt int , -- session count
session_avg double -- average sessions per user
)
PARTITIONED BY (dt string)
stored as parquet ;
Step 2: computation logic
insert into table dws17.app_trf_session_cube partition (dt='2020-10-07')
select
province,city,region,device_type,isnew,
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,region,device_type,isnew
grouping sets ((),(province),(province,city),(province,city,region),(device_type),(device_type,isnew));
Step 3: query
with rollup: aggregates level by level along the hierarchy formed by the selected columns; see the expansion sketch below
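For the four columns used below, ROLLUP is just shorthand for the following GROUPING SETS (a sketch of the expansion):
grouping sets ((province,city,region,device_type),
(province,city,region),
(province,city),
(province),
())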
Step 1: create the table (dws17.app_trf_session_cube above is reused)
Step 2: computation logic
insert into table dws17.app_trf_session_cube partition (dt='2020-10-07')
select
province,city,region,device_type,
cast(null as int) as isnew, -- pad the isnew column, which this rollup does not use
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,region,device_type
with rollup;
Step 3: query
5. Subject event tables
For example, when counting distinct users who liked content by province/city, the same user can show up in multiple groups, so the distinct counts of lower-level groups cannot simply be summed when rolling up; the trick below keeps the deduplicated user lists so they can be re-deduplicated at higher levels.
Data statistics:
with tmp as (
select
province,
city,
region,
lanmu,
count(1) as event_cnt,
count(distinct guid) as user_cnt,
collect_set(guid) as user_lst
from
test.count_distinct_reuse
group by province,city,region,lanmu
)
Data format:
Computation logic:
select
province,
city,
region,
sum(event_cnt) as event_cnt,
map_keys(str_to_map(concat_ws(',',collect_set(concat_ws(',',user_lst))))) as user_lst,
size(map_keys(str_to_map(concat_ws(',',collect_set(concat_ws(',',user_lst)))))) as user_cnt
from
tmp
group by province,city,region
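Why the str_to_map trick deduplicates (a sketch): str_to_map splits the concatenated id string on its default ',' delimiter and turns every id into a map key, and duplicate keys collapse inside the map:
select size(map_keys(str_to_map('u1,u2,u1'))); -- returns 2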
Computation result:
III. DWS Layer: User Active-Range Table
Build an intermediate table that records the start date and the end date of each of a user's continuous-activity ranges; such a table can support the report computations of all kinds of activity-analysis subjects.
Create the range table (for testing):
create table test.act_range(
guid string,
first_dt string,
rng_start string,
rng_end string
)
partitioned by (dt string)
row format delimited fields terminated by ','
;
Mock data for the range table
vi rng.11
a,2020-10-01,2020-10-01,2020-10-03
a,2020-10-01,2020-10-06,2020-10-08
x,2020-10-01,2020-10-01,2020-10-03
x,2020-10-01,2020-10-06,2020-10-08
b,2020-10-01,2020-10-01,2020-10-06
b,2020-10-01,2020-10-09,9999-12-31
c,2020-10-05,2020-10-05,9999-12-31
load data local inpath '/root/rng.11' into table test.act_range partition(dt='2020-10-11');
Create the DAU (daily active users) table
create table test.dau(guid string)
partitioned by (dt string)
row format delimited fields terminated by ','
;
Mock data for the DAU table
vi dau.12
a
c
d
x
load data local inpath '/root/dau.12' into table test.dau partition(dt='2020-10-12');
Computation logic
FULL JOIN the range table with the DAU table:
+---------+-------------+--------------+-------------+-------------+---------+-------------+
| a.guid | a.first_dt | a.rng_start | a.rng_end | a.dt | b.guid | b.dt |
+---------+-------------+--------------+-------------+-------------+---------+-------------+
| a | 2020-10-01 | 2020-10-06 | 2020-10-08 | 2020-10-11 | a | 2020-10-12 |
| a | 2020-10-01 | 2020-10-01 | 2020-10-03 | 2020-10-11 | a | 2020-10-12 |
| b | 2020-10-01 | 2020-10-09 | 9999-12-31 | 2020-10-11 | NULL | NULL |
| b | 2020-10-01 | 2020-10-01 | 2020-10-06 | 2020-10-11 | NULL | NULL |
| c | 2020-10-05 | 2020-10-05 | 9999-12-31 | 2020-10-11 | c | 2020-10-12 |
| NULL | NULL | NULL | NULL | NULL | d | 2020-10-12 |
| x | 2020-10-01 | 2020-10-06 | 2020-10-08 | 2020-10-11 | x | 2020-10-12 |
| x | 2020-10-01 | 2020-10-01 | 2020-10-03 | 2020-10-11 | x | 2020-10-12 |
+---------+-------------+--------------+-------------+-------------+---------+-------------+
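A sketch of the query that produces the output above (the same FULL JOIN as in the NULL-handling query below, without the nvl/if logic):
select a.*, b.*
from test.act_range a
full outer join test.dau b on a.guid=b.guid
where a.dt='2020-10-11' or b.dt='2020-10-12';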
Then judge the NULLs and pick values case by case: a NULL a-side means a brand-new user, so a fresh range opens today; an open range (rng_end='9999-12-31') with a NULL b-side means the user was not active today, so the range is closed at the T-1 date (a.dt); everything else carries over unchanged.
SELECT
nvl(a.guid,b.guid) as guid,
nvl(a.first_dt,b.dt) as first_dt,
nvl(a.rng_start,b.dt) as rng_start,
if(a.rng_end = '9999-12-31' and b.guid is null,a.dt,nvl(a.rng_end,'9999-12-31')) as rng_end
FROM
test.act_range a
full outer join
test.dau b
on a.guid=b.guid
where a.dt='2020-10-11' or b.dt='2020-10-12'
+-------+-------------+-------------+-------------+
| guid | first_dt | rng_start | rng_end |
+-------+-------------+-------------+-------------+
| a | 2020-10-01 | 2020-10-06 | 2020-10-08 |
| a | 2020-10-01 | 2020-10-01 | 2020-10-03 |
| b | 2020-10-01 | 2020-10-09 | 2020-10-11 |
| b | 2020-10-01 | 2020-10-01 | 2020-10-06 |
| c | 2020-10-05 | 2020-10-05 | 9999-12-31 |
| d | 2020-10-12 | 2020-10-12 | 9999-12-31 |
| x | 2020-10-01 | 2020-10-06 | 2020-10-08 |
| x | 2020-10-01 | 2020-10-01 | 2020-10-03 |
+-------+-------------+-------------+-------------+
-- Separate handling of newly opened ranges
1. FULL JOIN the T-1 range table with day T's DAU table, and pick values case by case (this, however, misses one slice of data: users who were not active recently but showed up today need a brand-new range);
2. From the T-1 range table, filter out the users who are not currently active (none of their ranges is still open), JOIN them with day T's DAU to get exactly those users who were inactive but are active today, and generate one new range record for each of them.
select
a.guid as guid,
a.first_dt as first_dt,
b.dt as rng_start,
'9999-12-31' as rng_end
from
(select
guid,
first_dt
from test.act_range
where dt='2020-10-11'
group by guid,first_dt
having max(rng_end) !='9999-12-31'
) a
join
(
select guid,dt
from test.dau
where dt='2020-10-12'
) b
on a.guid = b.guid
Script deployment
#!/bin/bash
# create table dws17.app_useract_range(
# guid string,
# first_dt string,
# rng_start string,
# rng_end string
# )
# partitioned by (dt string)
# stored as parquet
# ;
########################################################
# #
# @author hunter@doitedu #
# @date 2020-10-13 #
# @desc launcher script for the dws user active-range table job    #
# #
########################################################
export HIVE_HOME=/opt/apps/hive-3.1.2/
DT_CALC=`date -d'-1 day' +%Y-%m-%d`
DT_HIST=`date -d'-2 day' +%Y-%m-%d`
if [[ -n "$1" && -n "$2" ]]
then
DT_CALC=$1
DT_HIST=$2
fi
${HIVE_HOME}/bin/hive -e "
WITH dau as
(
select
guid,dt
from dwd17.app_action_detail
where dt='${DT_CALC}'
group by guid,dt
),
rng as (
select guid,first_dt,rng_start,rng_end,dt from dws17.app_useract_range where dt='${DT_HIST}'
)
INSERT INTO TABLE dws17.app_useract_range PARTITION(dt='${DT_CALC}')
SELECT
nvl(a.guid,b.guid) as guid,
nvl(a.first_dt,b.dt) as first_dt,
nvl(a.rng_start,b.dt) as rng_start,
if(a.rng_end = '9999-12-31' and b.guid is null,a.dt,nvl(a.rng_end,'9999-12-31')) as rng_end
FROM
-- the T-1 active-range table
rng a
full join
dau b
on a.guid=b.guid
union all
SELECT
a.guid as guid,
a.first_dt as first_dt,
b.dt as rng_start,
'9999-12-31' as rng_end
FROM
(
select
guid,
first_dt
from rng
group by
guid,
first_dt
having max(rng_end) !='9999-12-31'
) a
join
dau b
on a.guid = b.guid
"
if [ $? -eq 0 ]
then
echo "congratulations! 任务执行成功! 邮件已发送至admin@51doit.com"
else
echo "节哀顺变! 任务失败! 邮件已发送至admin@51doit.com"
fi
IV. ADS Layer: Monthly Continuous-Activity Report
The number of continuous active days comes from subtracting the range start date from the range end date (plus one).
Create the table
CREATE TABLE ads17.app_useract_stat_m(
calc_date string,
month string,
continuous_5days int, -- users continuously active >= 5 days within the month
continuous_7days int, -- users continuously active >= 7 days within the month
continuous_14days int, -- users continuously active >= 14 days within the month
continuous_20days int,
continuous_30days int
)
STORED AS PARQUET
;
Computation logic (the CTE below and the INSERT under "Insert the result" together form one single Hive statement)
WITH tmp AS (
SELECT
guid,
max(datediff(if(rng_end='9999-12-31','2020-10-07',rng_end),if(rng_start<'2020-10-01','2020-10-01',rng_start))+1) as max_continuous_days
FROM dws17.app_useract_range
WHERE dt='2020-10-07' AND rng_end >= '2020-10-01'
GROUP BY guid
)
Insert the result
INSERT INTO TABLE ads17.app_useract_stat_m
SELECT
'2020-10-07' as calc_date,
month('2020-10-07') as month,
count(if(max_continuous_days>=5,1,null)) as continuous_5days , -- users continuously active >= 5 days within the month
count(if(max_continuous_days>=7,1,null)) as continuous_7days , -- users continuously active >= 7 days within the month
count(if(max_continuous_days>=14,1,null)) as continuous_14days , -- users continuously active >= 14 days within the month
count(if(max_continuous_days>=20,1,null)) as continuous_20days ,
count(if(max_continuous_days>=30,1,null)) as continuous_30days
FROM tmp
;
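A worked check of the clamped datediff expression (using the mock ranges shown earlier, ignoring their partition dates): for a closed range [2020-10-01, 2020-10-06] with calculation date 2020-10-07, the end stays 2020-10-06 and the start is already >= 2020-10-01, so datediff = 5 and the range contributes 5+1 = 6 continuous days; for an open range [2020-10-05, 9999-12-31], the end is replaced by 2020-10-07, giving datediff = 2 and 2+1 = 3 continuous days.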
V. ADS Layer: New-User Retention Report
For the users active on the calculation day, subtract the first-login date from the calculation date to get the retention days, then count the users per cohort.
This wide (horizontal) report is inconvenient to compute directly, so design the following tall (vertical) table instead and compute it on a rolling daily basis.
Each day only needs to compute the retention from day X up to the calculation day.
建表
CREATE TABLE ads17.app_userret_stat_d(
calc_dt string,
ret_start_dt string,
ret_days string,
ret_user_cnt int
)
STORED AS PARQUET
;
Computation logic
INSERT INTO TABLE ads17.app_userret_stat_d
SELECT
'2020-10-07' AS calc_dt,
first_dt AS ret_start_dt,
if(datediff('2020-10-07',first_dt)>30,'30+',cast(datediff('2020-10-07',first_dt) as string)) AS ret_days,
count(1) AS ret_user_cnt
FROM dws17.app_useract_range
WHERE
dt='2020-10-07'
AND
rng_end='9999-12-31' -- only open ranges mean the user is active today, so only these count toward today's retention report
GROUP BY first_dt,if(datediff('2020-10-07',first_dt)>30,'30+',cast(datediff('2020-10-07',first_dt) as string))
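To read the report, the tall table can be pivoted back into a wide retention matrix; a sketch (the day-1/3/7/30 columns are an arbitrary choice):
SELECT
ret_start_dt ,
max(if(ret_days='1',ret_user_cnt,0)) as d1 ,
max(if(ret_days='3',ret_user_cnt,0)) as d3 ,
max(if(ret_days='7',ret_user_cnt,0)) as d7 ,
max(if(ret_days='30',ret_user_cnt,0)) as d30
FROM ads17.app_userret_stat_d
GROUP BY ret_start_dt;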
VI. Funnel Model
The funnel model mainly analyzes business conversion rates, for example the conversion at each step as a customer moves from the home page -> detail page -> order page.
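A minimal funnel sketch against the dwd17.app_action_detail table used throughout; the page ids 'home'/'detail'/'order' are assumed values, and this version only checks that each step occurred within the day, not that the steps happened in time order:
WITH steps AS (
SELECT
guid ,
max(if(properties['pageid']='home',1,0)) as s1 ,
max(if(properties['pageid']='detail',1,0)) as s2 ,
max(if(properties['pageid']='order',1,0)) as s3
FROM dwd17.app_action_detail
WHERE dt='2020-10-07' AND eventid='pageView'
GROUP BY guid
)
SELECT
sum(s1) as home_uv ,
sum(if(s1=1 AND s2=1,1,0)) as detail_uv ,
sum(if(s1=1 AND s2=1 AND s3=1,1,0)) as order_uv
FROM steps;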
Create the table:
Computation logic:
Data insertion:
Script deployment: