PB级企业电商离线数仓项目实战(下)-HW

1. 用拉链表实现核心交易分析中DIM层商家维表,并实现该拉链表的回滚

ODS层数据准备

因为需要自己构造数据,我们根据已有的ods_trade_shops 创建一个新的表:

create database hw;
drop table if exists hw.ods_trade_shops;
create table hw.ods_trade_shops(
  `shopid` int COMMENT '商铺ID',
  `userid` int COMMENT '商铺负责人', 
  `areaid` int COMMENT '区域ID',
  `shopname` string COMMENT '商铺名称',
  `shoplevel` int COMMENT '商铺等级',
  `status` int COMMENT '商铺状态',
  `createtime` string COMMENT '创建日期',
  `modifytime` string COMMENT  '修改日期'
) COMMENT '商家信息表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by ',';

题目要求自己构造数据,根据需求准备三天的数据:

/data/lagoudw/data/hw/shop-2020-06-29.dat
100050,1,100225,WSxxx营超市,1,1,2020-06-28,2020-06-29 13:22:22
100052,2,100236,新鲜xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22
100053,3,100011,华为xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22
100054,4,100159,小米xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22
100055,5,100211,苹果xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22

/data/lagoudw/data/hw/shop-2020-06-30.dat
100056,6,100050,OPxxx自营店,1,1,2020-06-28,2020-06-30 13:22:22
100057,7,100311,三只xxx鼠零食,1,1,2020-06-28,2020-06-30 13:22:22
100058,8,100329,良子xxx铺美食,1,1,2020-06-28,2020-06-30 13:22:22
100053,3,100011,华为xxx旗舰店,2,1,2020-06-28,2020-06-30 13:22:22
100055,5,100211,苹果xxx旗舰店,3,1,2020-06-28,2020-06-30 13:22:22

/data/lagoudw/data/hw/shop-2020-07-01.dat
100059,9,100225,乐居xxx日用品,1,1,2020-06-28,2020-07-01 13:22:22
100060,10,100211,同仁xxx大健康,1,1,2020-06-28,2020-07-01 13:22:22
100061,11,100159,家美xxx旗舰店,1,1,2020-06-28,2020-07-01 13:22:22
100053,3,100011,华为xxx旗舰店,2,1,2020-06-28,2020-07-01 13:22:22
100058,8,100329,良子xxx铺美食,3,1,2020-06-28,2020-07-01 13:22:22

#分别导入表中:
load data local inpath '/data/lagoudw/data/hw/shop-2020-06-29.dat' into table hw.ods_trade_shops
partition(dt='2020-06-29');

load data local inpath '/data/lagoudw/data/hw/shop-2020-06-30.dat' into table hw.ods_trade_shops
partition(dt='2020-06-30');

load data local inpath '/data/lagoudw/data/hw/shop-2020-07-01.dat' into table hw.ods_trade_shops
partition(dt='2020-07-01');

ODS 层数据加载完毕

DIM层数据准备

-- 同样在hw数据库里创建商家信息表, 即拉链表。
drop table if exists hw.dim_trade_shops;
create table hw.dim_trade_shops(
  `shopid` int COMMENT '商铺ID',
  `userid` int COMMENT '商铺负责人', 
  `areaid` int COMMENT '区域ID',
  `shopname` string COMMENT '商铺名称',
  `shoplevel` int COMMENT '商铺等级',
  `status` int COMMENT '商铺状态',
  `createtime` string COMMENT '创建日期',
  `modifytime` string COMMENT  '修改日期',
  `startdate` string  COMMENT '生效起始日期',
  `enddate` string  COMMENT '失效结束日期'
) COMMENT '商家信息表';

-- 初始化拉链表(加载6.29号的数据):
insert overwrite table hw.dim_trade_shops
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime,
case when modifytime is not null
     then from_unixtime(unix_timestamp(modifytime,'yyyy-MM-dd HH:mm:ss'),'yyyy-MM-dd')
     else createtime
     end as startdate,
    '9999-12-31' as enddate
from hw.ods_trade_shops
where dt='2020-06-29';

将加载语句写成脚本:

/data/lagoudw/script/hw/dim_load_trade_shops.sh

#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert overwrite table hw.dim_trade_shops
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime,
case when modifytime is not null
     then from_unixtime(unix_timestamp(modifytime,'yyyy-MM-dd HH:mm:ss'),'yyyy-MM-dd')
     else createtime
     end as startdate,
    '9999-12-31' as enddate
from hw.ods_trade_shops
where dt='$do_date'

union all

select A.shopid, 
       A.userid,
       A.areaid,
       A.shopname,
       A.shoplevel,
       A.status,
       A.createtime,
       A.modifytime,
       A.startdate,
       case when B.shopid is not null and A.enddate = '9999-12-31'
       then date_add('$do_date', -1)
       else A.enddate end as enddate
       from (select * from hw.ods_trade_shops where dt='$do_date') B
       right join hw.dim_trade_shops A on B.shopid = A.shopid; 
"
hive -e "$sql"

加载6.30 和7.1的数据:

sh /data/lagoudw/script/hw/dim_load_trade_shops.sh 2020-06-30
sh /data/lagoudw/script/hw/dim_load_trade_shops.sh 2020-07-01

检查数据:
在这里插入图片描述

拉链表回滚

/data/lagoudw/script/hw/dim_rollback_trade_shops.sh

#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
drop table hw.tmp;
create table hw.tmp as
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime, startdate, enddate
from hw.dim_trade_shops
where enddate < '$do_date'
union all
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime, startdate, '9999-12-31' as enddate
from hw.dim_trade_shops
where startdate <= '$do_date' and enddate >= '$do_date';
"
hive -e "$sql"

测试回滚6.29:

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qZpxyLXO-1639935559016)(C:\Users\jane\AppData\Roaming\Typora\typora-user-images\image-20211219204744428.png)]

2. 在会员分析中计算沉默会员数和流失会员数

沉默会员的定义:只在安装当天启动过App,而且安装时间是在7天前

流失会员的定义:最近30天未登录的会员

首先我将表 dws.dws_member_start_day 截取了一小部分的data,在我的hw数据库中重新创建,修改一些符合标准的数据

use hw;
drop table if exists hw.dws_member_start_day;
create table hw.dws_member_start_day
(
`device_id` string,
`uid` string,
`app_v` string,
`os_type` string,
`language` string,
`channel` string,
`area` string,
`brand` string
) COMMENT '会员日启动汇总'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by ',';

准备数据:

/data/lagoudw/data/hw/member-start-day-2020-06-29.dat
1FB872-9A1001,2F10092A1,1.1.0,0.97,chinese WM,三门峡,xiaomi-3,2020-06-29
1FB872-9A10010,2F10092A10,1.1.5,0.60,chinese ZD,江门,Huawei-9,2020-06-29
1FB872-9A100100,2F10092A100,1.1.10,7.2,chinese WK,金坛,xiaomi-6,2020-06-29

/data/lagoudw/data/hw/member-start-day-2020-07-02.dat
1FB872-9A1001,2F10092A1,1.1.0,0.97,chinese WM,三门峡,xiaomi-3,2020-07-02
1FB872-9A100100,2F10092A100,1.1.10,7.2,chinese WK,金坛,xiaomi-6,2020-07-02
1FB872-9A1001000,2F10092A1000,1.1.3,8.1,chinese UJ,唐山,xiaomi-0,2020-07-02
1FB872-9A1001001,2F10092A1001,1.1.10,8.3.5,chinese LB,乳山,xiaomi-7,2020-07-02
1FB872-9A1001002,2F10092A1002,1.1.9,0.0.0,chinese MZ,温州,xiaomi-0,2020-07-02
1FB872-9A1001003,2F10092A1003,1.1.14,3.20,chinese SL,安阳,Huawei-1,2020-07-02

/data/lagoudw/data/hw/member-start-day-2020-07-08.dat
1FB872-9A1001002,2F10092A1002,1.1.9,0.0.0,chinese MZ,温州,xiaomi-0,2020-07-08
1FB872-9A1001003,2F10092A1003,1.1.14,3.20,chinese SL,安阳,Huawei-1,2020-07-08

/data/lagoudw/data/hw/member-start-day-2020-08-05.dat
1FB872-9A1001,2F10092A1,1.1.0,0.97,chinese WM,三门峡,xiaomi-3,2020-08-05
1FB872-9A1001002,2F10092A1002,1.1.9,0.0.0,chinese MZ,温州,xiaomi-0,2020-08-05
1FB872-9A1001003,2F10092A1003,1.1.14,3.20,chinese SL,安阳,Huawei-1,2020-08-05
1FB872-9A1001004,2F10092A1004,1.1.9,9.9, chinese YX,威海,Huawei-2,2020-08-05

#导入数据:
load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-06-29.dat' into table hw.dws_member_start_day
partition(dt='2020-06-29');

load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-07-02.dat' into table hw.dws_member_start_day
partition(dt='2020-07-02');

load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-07-08.dat' into table hw.dws_member_start_day
partition(dt='2020-07-08');

load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-08-05.dat' into table hw.dws_member_start_day
partition(dt='2020-08-05');

计算沉默会员:

假定今日是2020-07-08, 筛选出的会员将只在2020-06-29登录,即只登录了一次

#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
drop table hw.silent_cnt;
create table hw.silent_cnt as
-- 先选出七天前登陆的用户
select count(*) as silent_member, '$do_date' as dt
from 
(select device_id from hw.dws_member_start_day
where dt<date_add('$do_date', -7)) A join
-- 只登陆的一次的用户
(select device_id, count(device_id) as cnt from hw.dws_member_start_day group by device_id having cnt=1) B on A.device_id = B.device_id
"
hive -e "$sql"

计算流失会员:

假定今日是2020-08-05,筛选出的会员30天前登陆过但是近30天内无登陆记录:

#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
drop table hw.lost_cnt;
create table hw.lost_cnt as
select count(distinct device_id) as lost_member, '$do_date' 
from hw.dws_member_start_day
where dt<=date_add('$do_date', -30)
and device_id not in (
select device_id
from hw.dws_member_start_day
where dt>date_add('$do_date', -30) and dt<='$do_date' 
)

"
hive --hiveconf "hive.strict.checks.cartesian.product=false" -e "$sql"

3. 在核心交易分析中完成如下指标的计算

-- 1.1 先统计统计每日订单笔数和销售总额,只统计有效订单:
drop table if exists dws.dws_trade_orders_day;
create table if not exists dws.dws_trade_orders_day(
  day_dt string comment '日期:yyyy-MM-dd',
  day_cnt decimal comment '日订单笔数',
  day_sum decimal comment '日订单总额'
) comment '日订单统计表';

insert overwrite table dws.dws_trade_orders_day
select dt, count(*) cnt, sum(totalmoney) sum_money
from (
select distinct orderid, dt, totalmoney from dwd.dwd_trade_orders where dataFlag='1'
) tmp
group by dt;

-- 1.2 尝试测试:
select * from dws.dws_trade_orders_day where dt between '2020-01-01' and '2020-12-31';

-- 2 统计2020年每个季度的销售订单笔数、订单总额
drop table if exists dws.dws_trade_orders_season;
create table if not exists dws.dws_trade_orders_season(
  year string comment '年份',
  season_dt string comment '季度:S1, S2, S3, S4',
  season_cnt decimal comment '季度订单笔数',
  season_sum decimal comment '季度订单总额'
) comment '季度订单统计表';

with tmp as(
select substr(day_dt, 0, 4) as year,
    case when substr(day_dt, 6, 2)='01' or substr(day_dt, 6, 2)='02' or substr(day_dt, 6, 2)='03' then "1"
     when substr(day_dt, 6, 2)='04' or substr(day_dt, 6, 2)='05' or substr(day_dt, 6, 2)='06' then "2"
     when substr(day_dt, 6, 2)='07' or substr(day_dt, 6, 2)='08' or substr(day_dt, 6, 2)='09' then "3"
     when substr(day_dt, 6, 2)='10' or substr(day_dt, 6, 2)='11' or substr(day_dt, 6, 2)='12' then "4" end as season_dt,
     day_cnt,
     day_sum
     from dws.dws_trade_orders_day
)
insert overwrite table dws.dws_trade_orders_season
select year,
       season_dt,
       sum(day_cnt),
       sum(day_sum)
from tmp
group by year, season_dt;

-- 3. 统计2020年每个月的销售订单笔数、订单总额
drop table if exists dws.dws_trade_orders_month;
create table if not exists dws.dws_trade_orders_month(
  year string comment '年份',
  month_dt string comment '月份',
  month_cnt decimal comment '月份订单笔数',
  month_sum decimal comment '月份订单总额'
) comment '月份订单统计表';

with tmp as(
select substr(day_dt, 0, 4) as year,
       substr(day_dt, 6, 2) as month_dt,
       day_cnt,
       day_sum
from dws.dws_trade_orders_day
)
insert overwrite table dws.dws_trade_orders_month
select year,
       month_dt,
       sum(day_cnt),
       sum(day_sum)
from tmp
group by year, month_dt;

– 统计2020年每周(周一到周日)的销售订单笔数、订单总额

使用udf函数得到对应日期的周数:

package cn.lagou.dw.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import java.text.ParseException;
import java.util.Calendar;
import java.util.Date;
import java.text.SimpleDateFormat;

public class WeekOfYear extends UDF {

    public int evaluate(final Text val) throws ParseException {
        String datestr = val.toString();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Date dt = sdf.parse(datestr);
        Calendar c1 = Calendar.getInstance();
        c1.setFirstDayOfWeek(Calendar.MONDAY);
        c1.setTime(dt);
        int dayOfWeek = c1.get(Calendar.WEEK_OF_YEAR);
        return dayOfWeek;
    }
    
}

接下来使用此函数进行周表计算:

drop table if exists dws.dws_trade_orders_week;
create table if not exists dws.dws_trade_orders_week(
  year string comment '年份',
  month_dt string comment '周数',
  month_cnt decimal comment '周份订单笔数',
  month_sum decimal comment '周份订单总额'
) comment '周份订单统计表';

with tmp as(
select substr(day_dt, 0, 4) as year,
       weekOfYear(day_dt) as week,
       day_cnt,
       day_sum
from dws.dws_trade_orders_day
)
insert overwrite table dws.dws_trade_orders_week
select year,
       week,
       sum(day_cnt),
       sum(day_sum)
from tmp group by year, week;

– 统计2020年国家法定节假日、休息日、工作日的订单笔数、订单总额

创建日期维表,用来统计当日是否是节假日or休息日or工作日

drop table if exists dim.day_info;
create table if not exists dim.day_info(
  year string comment '年份',
  dt string comment '日期',
  is_holiday int comment '1: yes; 0: no',
  is_workDay int comment '1: yes; 0: no'
) comment '周份订单统计表';

-- 法定假日统计
select A.year, nvl(sum(A.day_cnt),0), nvl(sum(A.day_sum),0) from
dws.dws_trade_orders_day group by year A join
select dt from dim.day_info where is_holiday=1
B on A.dt = B.dt;

-- 休息日统计
select A.year, nvl(sum(A.day_cnt),0), nvl(sum(A.day_sum),0) from
dws.dws_trade_orders_day join
select dt from dim.day_info where is_workDay=0
B on A.dt = B.dt
group by year;

-- 工作日统计
select A.year, nvl(sum(A.day_cnt),0), nvl(sum(A.day_sum),0) from
dws.dws_trade_orders_day join
select dt from dim.day_info where is_workDay=1
B on A.dt = B.dt
group by year;
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值