(从电商项目认识数仓体系六)需求及实现

本文详细介绍了电商项目中的三个关键需求——用户活跃、新增和留存的数仓设计。在DWS层,涉及每日、每周和每月的活跃设备分析,每日新增设备明细表的构建。在ADS层,设计了结果表以展示用户行为数据,如活跃设备报表和留存用户统计。通过这些设计,可以有效追踪和分析用户行为以支持业务决策。
摘要由CSDN通过智能技术生成

需求一

用户活跃主题

定义:统计当日、当周、当月活动的每个设备明细

DWS层表及逻辑设计

1 每日活跃设备明细分析

-- 建表语句
DROP TABLE IF EXISTS dws_uv_detail_day;
CREATE TABLE dws_uv_detail_day(
    `mid_id` STRING COMMENT'设备唯一标识',
    `user_id` STRING COMMENT'用户标识',
    `version_code` STRING COMMENT'程序版本号',
    `version_name` STRING COMMENT'程序版本名',
    `lang` STRING COMMENT'系统语言',
    `source` STRING COMMENT'渠道号',
    `os` STRING COMMENT'安卓系统版本',
    `area` STRING COMMENT'区域',
    `model` STRING COMMENT'手机型号',
    `brand` STRING COMMENT'手机品牌',
    `sdk_version` STRING COMMENT'sdkVersion',
    `gmail` STRING COMMENT'gmail',
    `height_width` STRING COMMENT'屏幕宽高',
    `app_time` STRING COMMENT'客户端日志产生时的时间',
    `network` STRING COMMENT'网络模式',
    `lng` STRING COMMENT'经度',
    `lat` STRING COMMENT'纬度',
) COOMENT'活跃用户天明细'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/des_uv_detail_day'

-- 明细
INSERT OVERWRITE TABLE dws_uv_detail_day PARTITION(dt='2019-02-10')
SELECT
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area,
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0]sdk_version ,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat
FROM dwd_start_log
WHERE dt='2019-02-10'
GROUP BY mid_id;

2 每周活跃设备明细分析

-- 建表语句
DROP TABLE IF EXISTS dws_uv_detail_week;
CREATE TABLE dws_uv_detail_week(
    `mid_id` STRING COMMENT'设备唯一标识',
    `user_id` STRING COMMENT'用户标识',
    `version_code` STRING COMMENT'程序版本号',
    `version_name` STRING COMMENT'程序版本名',
    `lang` STRING COMMENT'系统语言',
    `source` STRING COMMENT'渠道号',
    `os` STRING COMMENT'安卓系统版本',
    `area` STRING COMMENT'区域',
    `model` STRING COMMENT'手机型号',
    `brand` STRING COMMENT'手机品牌',
    `sdk_version` STRING COMMENT'sdkVersion',
    `gmail` STRING COMMENT'gmail',
    `height_width` STRING COMMENT'屏幕宽高',
    `app_time` STRING COMMENT'客户端日志产生时的时间',
    `network` STRING COMMENT'网络模式',
    `lng` STRING COMMENT'经度',
    `lat` STRING COMMENT'纬度',
    `monday_date` STRING COMMENT'周一日期',
    `sunday_date` STRING COMMENT'周日日期',
) COOMENT'活跃用户周明细'
PARTITIONED BY (`week_dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/des_uv_detail_week'

-- 明细
INSERT OVERWRITE TABLE dws_uv_detail_week PARTITION(week_dt)
SELECT
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area,
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0]sdk_version ,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat,
    DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7),
    DATE_ADD(NEXT_DAY('2019-02-10','SUNDAY'),-7),
    CONCAT(DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7),'_',DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-1))
FROM dws_uv_detail_day
WHERE 
    dt>=DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7)
    AND dt<=DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-1)
GROUP BY mid_id;

3 每月活跃设备明细分析

-- 建表语句
DROP TABLE IF EXISTS dws_uv_detail_month;
CREATE TABLE dws_uv_detail_week(
    `mid_id` STRING COMMENT'设备唯一标识',
    `user_id` STRING COMMENT'用户标识',
    `version_code` STRING COMMENT'程序版本号',
    `version_name` STRING COMMENT'程序版本名',
    `lang` STRING COMMENT'系统语言',
    `source` STRING COMMENT'渠道号',
    `os` STRING COMMENT'安卓系统版本',
    `area` STRING COMMENT'区域',
    `model` STRING COMMENT'手机型号',
    `brand` STRING COMMENT'手机品牌',
    `sdk_version` STRING COMMENT'sdkVersion',
    `gmail` STRING COMMENT'gmail',
    `height_width` STRING COMMENT'屏幕宽高',
    `app_time` STRING COMMENT'客户端日志产生时的时间',
    `network` STRING COMMENT'网络模式',
    `lng` STRING COMMENT'经度',
    `lat` STRING COMMENT'纬度'
) COOMENT'活跃用户月明细'
PARTITIONED BY (`month_dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/des_uv_detail_month'

-- 明细
INSERT OVERWRITE TABLE dws_uv_detail_month PARTITION(month_dt)
SELECT
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area,
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0]sdk_version ,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat,
    DATE_FORMAT('2019-02-10','yyyy-MM')
FROM dws_uv_detail_day
WHERE DATE_FORMAT(dt,'yyyy-MM')=DATE_FORMAT('2019-02-10','yyyy-MM')
GROUP BY mid_id;

4 形成脚本

#!/bin/bash
DB=gmail
if [ -n $1 ] ;then
    log_date=$1
else
    log_date=`date -d "-1 day" +%F`
fi

hql="
INSERT OVERWRITE TABLE "${DB}".dws_uv_detail_day PARTITION(dt='$log_date')
...
"
hive -e "${hql}"

ADS层表设计及逻辑

1 结果表

-- 建表语句
DROP TABLE IF EXISTS ads_uv_count;
CREATE EXTERNAL TABLE ads_uv_count(
    `dt` STRING COMMENT '统计日期',
    `day_count` BIGINT COMMENT '当日用户数量',
    `week_count` BIGINT COMMENT '当周用户数量',
    `month_count` BIGINT COMMENT '当月用户数量',
    `is_weekend` BIGINT COMMENT 'Y,N是否是周末,用于得到本周最终结果',
    `is_monthend` BIGINT COMMENT 'Y,N是否是月末,用于得到本月最终结果',
) COMMENT '每日活跃用户数量'
STORED AS PARQUET
LOCATION '/warehouse/gmall/ads/ads_uv_count_day/'

-- 实现逻辑
INSERT OVERWRITE TABLE ads_uv_count
SELECT
    `2019-02-10` dt,
    daycount.ct,
    weekcount.ct,
    monthcount.ct,
    IF(DATE_ADD(NEXT_DAY('2019-02-10','MONTH'),-1)='2019-02-10','Y','N'),
    IF(LAST_DAY('2019-02-10')='2019-02-10','Y','N')
FROM
(
    SELECT
        '2019-02-10' dt,
        COUNT(1) ct
    FROM dws_uv_detail_day
    WHERE dt='2019-02-10'
)daycount
JOIN
(
    SELECT
        '2019-02-10' dt,
        COUNT(1) ct
    FROM dws_uv_detail_week
    WHERE week_dt=CONCAT(DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-7),'_',DATE_ADD(NEXT_DAY('2019-02-10','MONDAY'),-1))
) weekcount
ON daycount.dt=weekcount.dt
JOIN
(
    SELECT
        '2019-02-10' dt,
        COUNT(1) ct
    FROM dws_uv_detail_month
    WHERE month_dt=DATE_FORMAT('2019-02-10','yyyy-MM')
)monthcount
ON daycount.dt=monthcount.dt;

2 形成脚本(与之前脚本类似,不再重复示例)

需求二

用户新增主题

定义:首次联网使用应用的用户。如果一个用户首次打开某app,那这个用户定义为新增用户;卸载再安装的设备,不会被算作一次新增。新增用户包括日新增用户、周新增用户、月新增用户

DWS层表及逻辑设计

1 每日新增设备明细表(总设备表)

-- 建表语句
DROP TABLE IF EXISTS `dws_new_mid_day`;
CREATE TABLE `dws_new_mid_day`
(
    `mid_id` STRING COMMENT'设备唯一标识',
    `user_id` STRING COMMENT'用户标识',
    `version_code` STRING COMMENT'程序版本号',
    `version_name` STRING COMMENT'程序版本名',
    `lang` STRING COMMENT'系统语言',
    `source` STRING COMMENT'渠道号',
    `os` STRING COMMENT'安卓系统版本',
    `area` STRING COMMENT'区域',
    `model` STRING COMMENT'手机型号',
    `brand` STRING COMMENT'手机品牌',
    `sdk_version` STRING COMMENT'sdkVersion',
    `gmail` STRING COMMENT'gmail',
    `height_width` STRING COMMENT'屏幕宽高',
    `app_time` STRING COMMENT'客户端日志产生时的时间',
    `network` STRING COMMENT'网络模式',
    `lng` STRING COMMENT'经度',
    `lat` STRING COMMENT'纬度',
    `create_date` STRING COMMENT'创建时间'
) COOMENT'每日新增设备信息'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_new_mid_day'

-- 逻辑实现
INSERT OVERWRITE TABLE dws_new_mid_day PARTITION(dt='2019-02-10')
SELECT
    ud.mid_id,
    ud.user_id,
    ud.version_code,
    ud.version_name,
    ud.lang,
    ud.source,
    ud.os,
    ud.area,
    ud.model,
    ud.brand,
    ud.sdk_version,
    ud.gmail,
    ud.height_width,
    ud.app_time,
    ud.network,
    ud.lng,
    ud.lat,
    '2019-02-10'
FROM 
    dwd_start_log ud
LEFT JOIN
    dws_new_mid_day nm
ON ud.mid_id=nm.mid_id
WHERE 
    ud.dt='2019-02-10'
    AND nm.mid_id IS NULL;

-- 个人认为,这个逻辑,虽然计算正确,但是并不是很合理

ADS层表及逻辑设计

1 每日新增设备报表

-- 建表语句
DROP TABLE IF EXISTS `ads_new_mid_count`;
CREATE TABLE `ads_new_mid_count`
(
    `create_date` STRING COMMENT'创建时间',
    `new_mid_count` BIGINT COMMENT'新增设备数量'
) COOMENT'每日新增设备量'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/ads/ads_new_mid_count'

-- 逻辑实现
INSERT OVERWRITE TABLE ads_new_mid_count
SELECT
    create_date,
    count(1)
FROM 
    dws_new_mid_day
WHERE create_date='2019-02-10'
GROUP BY create_Date;

需求三

用户留存

定义:

  1. 留存用户:某段时间内的新增用户(活跃用户),经过一段时间后,又继续使用应用的被认作是留存用户
  2. 留存率:留存用户占当时新增用户(活跃用户)的比例即是留存率

例:2月10日新增用户100,这100人在2月11日启动过应用的有30人,2月12日启动过应用的有25人,2月13日启动过应用的有32人,则2月10日新增用户次日留存率是30/100=30%,两日留存率是25/100=25%,三日留存率是32/100=32%

需求:每天计算前1、2、3、4、7、14天的留存率

需求分析:假设今天是11日,要统计前1天也就是10日新增设备的留存率,则计算方式为10日的新增设备且11日活跃的/10日的新增设备,10日的新增设备且11日活跃可使用10日新增设备明细与11日活跃设备关联,得到留存用户,10日新增设备可有每日新增设备明细表直接取出(根据分区)

DWS层表及逻辑设计

1 每日用户留存明细

-- 建表语句
DROP TABLE IF EXISTS `dws_user_retention_day`;
CREATE TABLE `dws_user_retention_day`
(
    `mid_id` STRING COMMENT'设备唯一标识',
    `user_id` STRING COMMENT'用户标识',
    `version_code` STRING COMMENT'程序版本号',
    `version_name` STRING COMMENT'程序版本名',
    `lang` STRING COMMENT'系统语言',
    `source` STRING COMMENT'渠道号',
    `os` STRING COMMENT'安卓系统版本',
    `area` STRING COMMENT'区域',
    `model` STRING COMMENT'手机型号',
    `brand` STRING COMMENT'手机品牌',
    `sdk_version` STRING COMMENT'sdkVersion',
    `gmail` STRING COMMENT'gmail',
    `height_width` STRING COMMENT'屏幕宽高',
    `app_time` STRING COMMENT'客户端日志产生时的时间',
    `network` STRING COMMENT'网络模式',
    `lng` STRING COMMENT'经度',
    `lat` STRING COMMENT'纬度',
    `create_date` STRING COMMENT'设备新增时间',
    `retention_day` INT COMMENT'截止当前日期留存天数'
) COOMENT'每日用户留存情况'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_user_retention_day'

-- 逻辑实现
INSERT OVERWRITE TABLE dws_user_retention_day PARTITION(dt='2019-02-11')
SELECT
    nm.mid_id,
    nm.user_id,
    nm.version_code,
    nm.version_name,
    nm.lang,
    nm.source,
    nm.os,
    nm.area,
    nm.model,
    nm.brand,
    nm.sdk_version,
    nm.gmail,
    nm.height_width,
    nm.app_time,
    nm.network,
    nm.lng,
    nm.lat,
    nm.create_date,
    1 retention_day
FROM 
    dws_uv_detail_day ud
LEFT JOIN
    dws_new_mid_day nm
ON ud.mid_id=nm.mid_id
WHERE 
    ud.dt='2019-02-11'
    AND nm.create_date=DATE_ADD('2019-02-11',-1);
UNION ALL
SELECT
    nm.mid_id,
    nm.user_id,
    nm.version_code,
    nm.version_name,
    nm.lang,
    nm.source,
    nm.os,
    nm.area,
    nm.model,
    nm.brand,
    nm.sdk_version,
    nm.gmail,
    nm.height_width,
    nm.app_time,
    nm.network,
    nm.lng,
    nm.lat,
    nm.create_date,
    2 retention_day
FROM 
    dws_uv_detail_day ud
LEFT JOIN
    dws_new_mid_day nm
ON ud.mid_id=nm.mid_id
WHERE 
    ud.dt='2019-02-11'
    AND nm.create_date=DATE_ADD('2019-02-11',-2)
...;

ADS层表及逻辑设计

1 留存用户数

-- 建表语句
DROP TABLE IF EXISTS `ads_user_retention_day_count`;
CREATE TABLE `ads_user_retention_day_count`
(
    `create_date` STRING COMMENT'设备新增日期',
    `retention_day` INT COMMENT'截止当前日期留存天数'
    `retention_count` BIGINT COMMENT'留存数量'
) COOMENT'每日新增设备量'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/ads/ads_user_retention_day_count'

-- 逻辑实现
INSERT INTO TABLE ads_user_retention_day_count
SELECT
    create_date,
    retention_day,
    count(1) retention_count
FROM dws_new_mid_day
WHERE create_date='2019-02-11'
GROUP BY 
    create_date,
    retention_day;

2 留存用户比率

-- 建表语句
DROP TABLE IF EXISTS `ads_user_retention_day_rate`;
CREATE TABLE `ads_user_retention_day_rate`
(
    `stat_date` STRING COMMENT'统计日期',
    `create_date` STRING COMMENT'设备新增日期',
    `retention_day` INT COMMENT'截止当前日期留存天数',
    `retention_count` BIGINT COMMENT'留存数量',
    `new_mid_count` INT COMMENT'当日设备新增数量',
    `retention_ratio` INT COMMENT'留存率',
) COOMENT'每日用户留存情况'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/ads/ads_user_retention_day_rate'

-- 逻辑实现
INSERT INTO TABLE ads_user_retention_day_rate
SELECT
    '2019-02-11',
    ur.create_date,
    ur.retention_day,
    ur.retention_count,
    nc.new_mid_count,
    ur.retention_count/nv.new_mid_count*100
FROM
(
    SELECT
        create_date,
        retention_day,
        count(1) retention_count
    FROM `dws_user_retention_day`
    WHERE dt='2019-02-11'
    GROUP BY
        create_Date,
        retention_day
) ur
JOIN
ads_new_mid_count nc
ON nv.create_date=ur.create_date

 

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值