I. DWS Layer Development
The modeling philosophy of the DWS layer is to support the computation of final requirements, so its modeling is relatively flexible.
Common modeling approaches:
1. Dimension integration (building wide tables):
In a fact table, join each dimension id against its dimension table and replace it with the actual dimension values; fact tables of several different subjects may also be joined together.
2. Light aggregation by subject:
Lightly aggregate the detail data by a specific subject, so that the many related statistical reports downstream can reuse the result.
For example, to serve various traffic statistics reports, design a traffic session aggregation table and a traffic user aggregation table.
3. Subject partitioning:
For example, our behavior log contains dozens of event types. For certain key events, analysts need to run focused, in-depth analyses that do not involve any other events; such events can be extracted into dedicated tables (subject tables).
For example, from our behavior event table we can extract:
a traffic event table
a favorites event table
an ad event table
Once events are split into subject tables, each event subject table can flatten its properties field into structured columns, which makes all later analyses of that event subject much more convenient.
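As an illustration, here is a minimal sketch of such an extraction; the target table dws17.app_evt_favorite, the event id 'favorite', and the properties keys 'pageid'/'itemid' are assumed names, not taken from the project:
-- assumes dws17.app_evt_favorite was created beforehand with matching columns
INSERT INTO TABLE dws17.app_evt_favorite PARTITION(dt='2020-10-07')
SELECT
guid ,
sessionid ,
ts ,
properties['pageid'] as page_id , -- flattened out of the generic properties map
properties['itemid'] as item_id
FROM dwd17.app_action_detail
WHERE dt='2020-10-07' AND eventid='favorite';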
pv: page view, i.e., the number of page views;
uv: unique visitor, i.e., the number of distinct visitors.
II. ADS Layer Development
1. Traffic session aggregation table
Table model:
guid, sessionid, start time, end time, entry page, exit page, page-view count, new/returning flag, province, city, region, device model, ...
-- Source table: dwd17.app_action_detail
-- Target table: dws17.app_trf_agr_session
-- Create the target table model
CREATE DATABASE dws17;
CREATE TABLE dws17.app_trf_agr_session(
guid string ,
session_id string ,
start_ts bigint ,
end_ts bigint ,
first_page_id string ,
last_page_id string ,
pv_cnt int ,
isnew int ,
hour_itv int ,
country string ,
province string ,
city string ,
region string ,
device_type string
)
PARTITIONED BY (dt string)
STORED AS PARQUET
;
Computation logic:
Aggregate all the events of one session of one user into a single record.
Once this DWS table exists, it can support the development of a great many reports.
WITH tmp1 AS
(
SELECT
guid ,
sessionid as session_id ,
min(ts) as start_ts ,
hour(from_unixtime(cast(min(ts)/1000 as bigint),'yyyy-MM-dd HH:mm:ss')) as hour_itv ,
max(ts) as end_ts ,
min(isnew) as isnew ,
count(if(eventid='pageView',1,null)) as pv_cnt
FROM dwd17.app_action_detail
WHERE dt='2020-10-07'
GROUP BY guid,sessionid
)
,tmp2 AS
(
SELECT
guid as guid ,
session_id as session_id ,
min(first_page_id) as first_page_id ,
min(last_page_id) as last_page_id ,
min(country) as country ,
min(province) as province ,
min(city) as city ,
min(region) as region ,
min(device_type) as device_type
FROM
(
SELECT
guid ,
sessionid as session_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts asc) as first_page_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts desc) as last_page_id ,
first_value(country) over(partition by guid,sessionid order by ts asc) as country ,
first_value(province) over(partition by guid,sessionid order by ts asc) as province ,
first_value(city) over(partition by guid,sessionid order by ts asc) as city ,
first_value(region) over(partition by guid,sessionid order by ts asc) as region ,
first_value(devicetype) over(partition by guid,sessionid order by ts asc) as device_type
FROM dwd17.app_action_detail
WHERE dt='2020-10-07' AND eventid='pageView'
) o
GROUP BY guid,session_id
)
INSERT INTO TABLE dws17.app_trf_agr_session PARTITION(dt='2020-10-07')
SELECT
tmp1.guid ,
tmp1.session_id ,
tmp1.start_ts ,
tmp1.end_ts ,
tmp2.first_page_id ,
tmp2.last_page_id ,
tmp1.pv_cnt ,
tmp1.isnew ,
tmp1.hour_itv ,
tmp2.country ,
tmp2.province ,
tmp2.city ,
tmp2.region ,
tmp2.device_type
FROM
tmp1
join
tmp2
on tmp1.guid=tmp2.guid and tmp1.session_id=tmp2.session_id
;
To query the data, write two subqueries and alias them tmp1 and tmp2, then join the two for the final select.
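For example, an hourly traffic report with bounce rate can be read straight off this table; a minimal sketch (treating sessions with at most one page view as bounces, which is an assumption, not a spec from the project):
SELECT
hour_itv ,
count(1) as session_cnt ,
round(count(if(pv_cnt<=1,1,null))/count(1),4) as bounce_rate
FROM dws17.app_trf_agr_session
WHERE dt='2020-10-07'
GROUP BY hour_itv;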
Script deployment:
#!/bin/bash
########################################################
# #
# @author hunter@doitedu #
# @date 2020-10-13 #
# @desc launcher script for the dwd -> dws traffic session agg job  #
# #
########################################################
export HIVE_HOME=/opt/apps/hive-3.1.2/
DT=`date -d'-1 day' +%Y-%m-%d`
if [ -n "$1" ]
then
DT="$1"
fi
${HIVE_HOME}/bin/hive -e "
WITH tmp1 AS
(
SELECT
guid ,
sessionid as session_id ,
min(ts) as start_ts ,
hour(from_unixtime(cast(min(ts)/1000 as bigint),'yyyy-MM-dd HH:mm:ss')) as hour_itv ,
max(ts) as end_ts ,
min(isnew) as isnew ,
count(if(eventid='pageView',1,null)) as pv_cnt
FROM dwd17.app_action_detail
WHERE dt='${DT}'
GROUP BY guid,sessionid
)
,tmp2 AS
(
SELECT
guid as guid ,
session_id as session_id ,
min(first_page_id) as first_page_id ,
min(last_page_id) as last_page_id ,
min(country) as country ,
min(province) as province ,
min(city) as city ,
min(region) as region ,
min(device_type) as device_type
FROM
(
SELECT
guid ,
sessionid as session_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts asc) as first_page_id ,
first_value(properties['pageid']) over(partition by guid,sessionid order by ts desc) as last_page_id ,
first_value(country) over(partition by guid,sessionid order by ts asc) as country ,
first_value(province) over(partition by guid,sessionid order by ts asc) as province ,
first_value(city) over(partition by guid,sessionid order by ts asc) as city ,
first_value(region) over(partition by guid,sessionid order by ts asc) as region ,
first_value(devicetype) over(partition by guid,sessionid order by ts asc) as device_type
FROM dwd17.app_action_detail
WHERE dt='${DT}' AND eventid='pageView'
) o
GROUP BY guid,session_id
)
INSERT INTO TABLE dws17.app_trf_agr_session PARTITION(dt='${DT}')
SELECT
tmp1.guid ,
tmp1.session_id ,
tmp1.start_ts ,
tmp1.end_ts ,
tmp2.first_page_id ,
tmp2.last_page_id ,
tmp1.pv_cnt ,
tmp1.isnew ,
tmp1.hour_itv ,
tmp2.country ,
tmp2.province ,
tmp2.city ,
tmp2.region ,
tmp2.device_type
FROM
tmp1
join
tmp2
on tmp1.guid=tmp2.guid and tmp1.session_id=tmp2.session_id
"
if [ $? -eq 0 ]
then
echo "congratulations! 任务执行成功! 邮件已发送至admin@51doit.com"
else
echo "节哀顺变! 任务失败! 邮件已发送至admin@51doit.com"
fi
2. Session count analysis
Step 1: create the table
CREATE TABLE dws17.app_trf_session_statistic(
province string,
device_type string,
session_cnt int,
session_avg double
)
PARTITIONED BY (dt string)
STORED AS PARQUET
;
Step 2: computation logic
insert into table dws17.app_trf_session_statistic partition (dt='2020-10-07')
select
province,
device_type,
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,device_type;
3. Returning-visitor analysis
Step 1: create the table
create table dws17.app_trf_reback_statistic(
province string,
city string,
device_type string,
uv_cnt int,
reback_cnt int
)
PARTITIONED BY (dt string)
stored as parquet ;
Step 2: computation logic
insert into table dws17.app_trf_reback_statistic partition (dt='2020-10-07')
select
province,
city,
device_type,
count(1) as uv_cnt,
count(if(session_cnt>1,1,null)) as reback_cnt
from
(select
province,
city,
device_type,
guid,
count(1) as session_cnt
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,device_type,guid
) o
group by province,city,device_type;
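A follow-up sketch of how this table would be consumed (the ratio column name is made up here):
select
province,
city,
device_type,
round(reback_cnt/uv_cnt,4) as reback_ratio
from dws17.app_trf_reback_statistic
where dt='2020-10-07';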
4. Multidimensional data cube
In the visitor analysis above, besides the province/city breakdown, statistics may also be needed by province alone, by device model, and so on; all the combinations that might be requested can therefore be computed in one pass with Hive's high-order grouping syntax:
with cube: aggregates over every combination of the selected columns
Step 1: create the table (the DDL for dws17.app_trf_session_cube is shown under the grouping sets section below)
Step 2: computation logic
insert into table dws17.app_trf_session_cube partition (dt='2020-10-07')
select
province,city,region,device_type,isnew,
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,region,device_type,isnew
with cube;
Step 3: query
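In a CUBE result, any column that is absent from a given combination comes back NULL, so a particular combination is selected by filtering on NULLs; a sketch for province-level totals (production code would usually also persist GROUPING__ID when writing the cube, so that subtotal rows are not confused with genuinely NULL source values):
select province, session_cnt, session_avg
from dws17.app_trf_session_cube
where dt='2020-10-07'
and province is not null and city is null and region is null
and device_type is null and isnew is null;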
grouping sets: you list, after the selected columns, exactly the combinations you want to appear
Step 1: create the table
-- do all the multidimensional analysis of session counts and average sessions per user in one pass
create table dws17.app_trf_session_cube(
province string,
city string,
region string,
device_type string,
isnew int,
session_cnt int , -- session count
session_avg double -- average sessions per user
)
PARTITIONED BY (dt string)
stored as parquet ;
Step 2: computation logic
insert into table dws17.app_trf_session_cube partition (dt='2020-10-07')
select
province,city,region,device_type,isnew,
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,region,device_type,isnew
grouping sets ((),(province),(province,city),(province,city,region),(device_type),(device_type,isnew));
Step 3: query
with rollup: aggregates level by level along the hierarchy formed by the selected columns; see the expansion sketch below
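For the four columns used below, ROLLUP is just shorthand for the following GROUPING SETS (a sketch of the expansion):
grouping sets ((province,city,region,device_type),
(province,city,region),
(province,city),
(province),
())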
Step 1: create the table (dws17.app_trf_session_cube above is reused)
Step 2: computation logic
insert into table dws17.app_trf_session_cube partition (dt='2020-10-07')
select
province,city,region,device_type,
cast(null as int) as isnew, -- pad the isnew column, which this rollup does not use
count(1) as session_cnt,
round(count(1)/count(distinct guid),2) as session_avg
from
dws17.app_trf_agr_session
where
dt='2020-10-07'
group by province,city,region,device_type
with rollup;
Step 3: query
5. Subject event tables
For example, when counting distinct users who liked content by province/city, the same user can show up in multiple groups, so the distinct counts of lower-level groups cannot simply be summed when rolling up; the trick below keeps the deduplicated user lists so they can be re-deduplicated at higher levels.
Data statistics:
with tmp as (
select
province,
city,
region,
lanmu,
count(1) as event_cnt,
count(distinct guid) as user_cnt,
collect_set(guid) as user_lst
from
test.count_distinct_reuse
group by province,city,region,lanmu
)
Data format:
Computation logic:
select
province,
city,
region,
sum(event_cnt) as event_cnt,
map_keys(str_to_map(concat_ws(',',collect_set(concat_ws(',',user_lst))))) as user_lst,
size(map_keys(str_to_map(concat_ws(',',collect_set(concat_ws(',',user_lst)))))) as user_cnt
from
tmp
group by province,city,region
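Why the str_to_map trick deduplicates (a sketch): str_to_map splits the concatenated id string on its default ',' delimiter and turns every id into a map key, and duplicate keys collapse inside the map:
select size(map_keys(str_to_map('u1,u2,u1'))); -- returns 2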
Computation result:
III. DWS Layer: User Active-Range Table
Build an intermediate table that records the start date and the end date of each of a user's continuous-activity ranges; such a table can support the report computations of all kinds of activity-analysis subjects.
Create the range table (for testing):
create table test.act_range(
guid string,
first_dt string,
rng_start string,
rng_end string
)
partitioned by (dt string)
row format delimited fields terminated by ','
;
Mock data for the range table
vi rng.11
a,2020-10-01,2020-10-01,2020-10-03
a,2020-10-01,2020-10-06,2020-10-08
x,2020-10-01,2020-10-01,2020-10-03
x,2020-10-01,2020-10-06,2020-10-08
b,2020-10-01,2020-10-01,2020-10-06
b,2020-10-01,2020-10-09,9999-12-31
c,2020-10-05,2020-10-05,9999-12-31
load data local inpath '/root/rng.11' into table test.act_range partition(dt='2020-10-11');
Create the DAU (daily active users) table
create table test.dau(guid string)
partitioned by (dt string)
row format delimited fields terminated by ','
;
Mock data for the DAU table
vi dau.12
a
c
d
x
load data local inpath '/root/dau.12' into table test.dau partition(dt='2020-10-12');
Computation logic
FULL JOIN the range table with the DAU table:
+---------+-------------+--------------+-------------+-------------+---------+-------------+
| a.guid | a.first_dt | a.rng_start | a.rng_end | a.dt | b.guid | b.dt |
+---------+-------------+--------------+-------------+-------------+---------+-------------+
| a | 2020-10-01 | 2020-10-06 | 2020-10-08 | 2020-10-11 | a | 2020-10-12 |
| a | 2020-10-01 | 2020-10-01 | 2020-10-03 | 2020-10-11 | a | 2020-10-12 |
| b | 2020-10-01 | 2020-10-09 | 9999-12-31 | 2020-10-11 | NULL | NULL |
| b | 2020-10-01 | 2020-10-01 | 2020-10-06 | 2020-10-11 | NULL | NULL |
| c | 2020-10-05 | 2020-10-05 | 9999-12-31 | 2020-10-11 | c | 2020-10-12 |
| NULL | NULL | NULL | NULL | NULL | d | 2020-10-12 |
| x | 2020-10-01 | 2020-10-06 | 2020-10-08 | 2020-10-11 | x | 2020-10-12 |
| x | 2020-10-01 | 2020-10-01 | 2020-10-03 | 2020-10-11 | x | 2020-10-12 |
+---------+-------------+--------------+-------------+-------------+---------+-------------+
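A sketch of the query that produces the output above (the same FULL JOIN as in the NULL-handling query below, without the nvl/if logic):
select a.*, b.*
from test.act_range a
full outer join test.dau b on a.guid=b.guid
where a.dt='2020-10-11' or b.dt='2020-10-12';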
Then judge the NULLs and pick values case by case: a NULL a-side means a brand-new user, so a fresh range opens today; an open range (rng_end='9999-12-31') with a NULL b-side means the user was not active today, so the range is closed at the T-1 date (a.dt); everything else carries over unchanged.
SELECT
nvl(a.guid,b.guid) as guid,
nvl(a.first_dt,b.dt) as first_dt,
nvl(a.rng_start,b.dt) as rng_start,
if(a.rng_end = '9999-12-31' and b.guid is null,a.dt,nvl(a.rng_end,'9999-12-31')) as rng_end
FROM
test.act_range a
full outer join
test.dau b
on a.guid=b.guid
where a.dt='2020-10-11' or b.dt='2020-10-12'
+-------+-------------+-------------+-------------+
| guid | first_dt | rng_start | rng_end |
+-------+-------------+-------------+-------------+
| a | 2020-10-01 | 2020-10-06 | 2020-10-08 |
| a | 2020-10-01 | 2020-10-01 | 2020-10-03 |
| b | 2020-10-01 | 2020-10-09 | 2020-10-11 |
| b | 2020-10-01 | 2020-10-01 | 2020-10-06 |
| c | 2020-10-05 | 2020-10-05 | 9999-12-31 |
| d | 2020-10-12 | 2020-10-12 | 9999-12-31 |
| x | 2020-10-01 | 2020-10-06 | 2020-10-08 |
| x | 2020-10-01 | 2020-10-01 | 2020-10-03 |
+-------+-------------+-------------+-------------+
-- Separate handling of newly opened ranges
1. FULL JOIN the T-1 range table with day T's DAU table, and pick values case by case (this, however, misses one slice of data: users who were not active recently but showed up today need a brand-new range);
2. From the T-1 range table, filter out the users who are not currently active (none of their ranges is still open), JOIN them with day T's DAU to get exactly those users who were inactive but are active today, and generate one new range record for each of them.
select
a.guid as guid,
a.first_dt as first_dt,
b.dt as rng_start,
'9999-12-31' as rng_end
from
(select
guid,
first_dt
from test.act_range
where dt='2020-10-11'
group by guid,first_dt
having max(rng_end) !='9999-12-31'
) a
join
(
select guid,dt
from test.dau
where dt='2020-10-12'
) b
on a.guid = b.guid
Script deployment
#!/bin/bash
# create table dws17.app_useract_range(
# guid string,
# first_dt string,
# rng_start string,
# rng_end string
# )
# partitioned by (dt string)
# stored as parquet
# ;
########################################################
# #
# @author hunter@doitedu #
# @date 2020-10-13 #
# @desc launcher script for the dws user active-range table job    #
# #
########################################################
export HIVE_HOME=/opt/apps/hive-3.1.2/
DT_CALC=`date -d'-1 day' +%Y-%m-%d`
DT_HIST=`date -d'-2 day' +%Y-%m-%d`
if [[ -n "$1" && -n "$2" ]]
then
DT_CALC=$1
DT_HIST=$2
fi
${HIVE_HOME}/bin/hive -e "
WITH dau as
(
select
guid,dt
from dwd17.app_action_detail
where dt='${DT_CALC}'
group by guid,dt
),
rng as (
select guid,first_dt,rng_start,rng_end,dt from dws17.app_useract_range where dt='${DT_HIST}'
)
INSERT INTO TABLE dws17.app_useract_range PARTITION(dt='${DT_CALC}')
SELECT
nvl(a.guid,b.guid) as guid,
nvl(a.first_dt,b.dt) as first_dt,
nvl(a.rng_start,b.dt) as rng_start,
if(a.rng_end = '9999-12-31' and b.guid is null,a.dt,nvl(a.rng_end,'9999-12-31')) as rng_end
FROM
-- the T-1 active-range table
rng a
full join
dau b
on a.guid=b.guid
union all
SELECT
a.guid as guid,
a.first_dt as first_dt,
b.dt as rng_start,
'9999-12-31' as rng_end
FROM
(
select
guid,
first_dt
from rng
group by
guid,
first_dt
having max(rng_end) !='9999-12-31'
) a
join
dau b
on a.guid = b.guid
"
if [ $? -eq 0 ]
then
echo "congratulations! 任务执行成功! 邮件已发送至admin@51doit.com"
else
echo "节哀顺变! 任务失败! 邮件已发送至admin@51doit.com"
fi
IV. ADS Layer: Monthly Continuous-Activity Report
The number of continuous active days comes from subtracting the range start date from the range end date (plus one).
Create the table
CREATE TABLE ads17.app_useract_stat_m(
calc_date string,
month string,
continuous_5days int, -- users continuously active >= 5 days within the month
continuous_7days int, -- users continuously active >= 7 days within the month
continuous_14days int, -- users continuously active >= 14 days within the month
continuous_20days int,
continuous_30days int
)
STORED AS PARQUET
;
Computation logic (the CTE below and the INSERT under "Insert the result" together form one single Hive statement)
WITH tmp AS (
SELECT
guid,
max(datediff(if(rng_end='9999-12-31','2020-10-07',rng_end),if(rng_start<'2020-10-01','2020-10-01',rng_start))+1) as max_continuous_days
FROM dws17.app_useract_range
WHERE dt='2020-10-07' AND rng_end >= '2020-10-01'
GROUP BY guid
)
Insert the result
INSERT INTO TABLE ads17.app_useract_stat_m
SELECT
'2020-10-07' as calc_date,
month('2020-10-07') as month,
count(if(max_continuous_days>=5,1,null)) as continuous_5days , -- users continuously active >= 5 days within the month
count(if(max_continuous_days>=7,1,null)) as continuous_7days , -- users continuously active >= 7 days within the month
count(if(max_continuous_days>=14,1,null)) as continuous_14days , -- users continuously active >= 14 days within the month
count(if(max_continuous_days>=20,1,null)) as continuous_20days ,
count(if(max_continuous_days>=30,1,null)) as continuous_30days
FROM tmp
;
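A worked check of the clamped datediff expression (using the mock ranges shown earlier, ignoring their partition dates): for a closed range [2020-10-01, 2020-10-06] with calculation date 2020-10-07, the end stays 2020-10-06 and the start is already >= 2020-10-01, so datediff = 5 and the range contributes 5+1 = 6 continuous days; for an open range [2020-10-05, 9999-12-31], the end is replaced by 2020-10-07, giving datediff = 2 and 2+1 = 3 continuous days.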
V. ADS Layer: New-User Retention Report
For the users active on the calculation day, subtract the first-login date from the calculation date to get the retention days, then count the users per cohort.
This wide (horizontal) report is inconvenient to compute directly, so design the following tall (vertical) table instead and compute it on a rolling daily basis.
Each day only needs to compute the retention from day X up to the calculation day.
建表
CREATE TABLE ads17.app_userret_stat_d(
calc_dt string,
ret_start_dt string,
ret_days string,
ret_user_cnt int
)
STORED AS PARQUET
;
Computation logic
INSERT INTO TABLE ads17.app_userret_stat_d
SELECT
'2020-10-07' AS calc_dt,
first_dt AS ret_start_dt,
if(datediff('2020-10-07',first_dt)>30,'30+',cast(datediff('2020-10-07',first_dt) as string)) AS ret_days,
count(1) AS ret_user_cnt
FROM dws17.app_useract_range
WHERE
dt='2020-10-07'
AND
rng_end='9999-12-31' -- only open ranges mean the user is active today, so only these count toward today's retention report
GROUP BY first_dt,if(datediff('2020-10-07',first_dt)>30,'30+',cast(datediff('2020-10-07',first_dt) as string))
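To read the report, the tall table can be pivoted back into a wide retention matrix; a sketch (the day-1/3/7/30 columns are an arbitrary choice):
SELECT
ret_start_dt ,
max(if(ret_days='1',ret_user_cnt,0)) as d1 ,
max(if(ret_days='3',ret_user_cnt,0)) as d3 ,
max(if(ret_days='7',ret_user_cnt,0)) as d7 ,
max(if(ret_days='30',ret_user_cnt,0)) as d30
FROM ads17.app_userret_stat_d
GROUP BY ret_start_dt;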
VI. Funnel Model
The funnel model mainly analyzes business conversion rates, for example the conversion at each step as a customer moves from the home page -> detail page -> order page.
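A minimal funnel sketch against the dwd17.app_action_detail table used throughout; the page ids 'home'/'detail'/'order' are assumed values, and this version only checks that each step occurred within the day, not that the steps happened in time order:
WITH steps AS (
SELECT
guid ,
max(if(properties['pageid']='home',1,0)) as s1 ,
max(if(properties['pageid']='detail',1,0)) as s2 ,
max(if(properties['pageid']='order',1,0)) as s3
FROM dwd17.app_action_detail
WHERE dt='2020-10-07' AND eventid='pageView'
GROUP BY guid
)
SELECT
sum(s1) as home_uv ,
sum(if(s1=1 AND s2=1,1,0)) as detail_uv ,
sum(if(s1=1 AND s2=1 AND s3=1,1,0)) as order_uv
FROM steps;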
Create the table:
Computation logic:
Data insertion:
Script deployment: