1. 用拉链表实现核心交易分析中DIM层商家维表,并实现该拉链表的回滚
ODS层数据准备
因为需要自己构造数据,我们根据已有的ods_trade_shops 创建一个新的表:
create database hw;
drop table if exists hw.ods_trade_shops;
create table hw.ods_trade_shops(
`shopid` int COMMENT '商铺ID',
`userid` int COMMENT '商铺负责人',
`areaid` int COMMENT '区域ID',
`shopname` string COMMENT '商铺名称',
`shoplevel` int COMMENT '商铺等级',
`status` int COMMENT '商铺状态',
`createtime` string COMMENT '创建日期',
`modifytime` string COMMENT '修改日期'
) COMMENT '商家信息表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by ',';
题目要求自己构造数据,根据需求准备三天的数据:
/data/lagoudw/data/hw/shop-2020-06-29.dat
100050,1,100225,WSxxx营超市,1,1,2020-06-28,2020-06-29 13:22:22
100052,2,100236,新鲜xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22
100053,3,100011,华为xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22
100054,4,100159,小米xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22
100055,5,100211,苹果xxx旗舰店,1,1,2020-06-28,2020-06-29 13:22:22
/data/lagoudw/data/hw/shop-2020-06-30.dat
100056,6,100050,OPxxx自营店,1,1,2020-06-28,2020-06-30 13:22:22
100057,7,100311,三只xxx鼠零食,1,1,2020-06-28,2020-06-30 13:22:22
100058,8,100329,良子xxx铺美食,1,1,2020-06-28,2020-06-30 13:22:22
100053,3,100011,华为xxx旗舰店,2,1,2020-06-28,2020-06-30 13:22:22
100055,5,100211,苹果xxx旗舰店,3,1,2020-06-28,2020-06-30 13:22:22
/data/lagoudw/data/hw/shop-2020-07-01.dat
100059,9,100225,乐居xxx日用品,1,1,2020-06-28,2020-07-01 13:22:22
100060,10,100211,同仁xxx大健康,1,1,2020-06-28,2020-07-01 13:22:22
100061,11,100159,家美xxx旗舰店,1,1,2020-06-28,2020-07-01 13:22:22
100053,3,100011,华为xxx旗舰店,2,1,2020-06-28,2020-07-01 13:22:22
100058,8,100329,良子xxx铺美食,3,1,2020-06-28,2020-07-01 13:22:22
#分别导入表中:
load data local inpath '/data/lagoudw/data/hw/shop-2020-06-29.dat' into table hw.ods_trade_shops
partition(dt='2020-06-29');
load data local inpath '/data/lagoudw/data/hw/shop-2020-06-30.dat' into table hw.ods_trade_shops
partition(dt='2020-06-30');
load data local inpath '/data/lagoudw/data/hw/shop-2020-07-01.dat' into table hw.ods_trade_shops
partition(dt='2020-07-01');
ODS 层数据加载完毕
DIM层数据准备
-- 同样在hw数据库里创建商家信息表, 即拉链表。
drop table if exists hw.dim_trade_shops;
create table hw.dim_trade_shops(
`shopid` int COMMENT '商铺ID',
`userid` int COMMENT '商铺负责人',
`areaid` int COMMENT '区域ID',
`shopname` string COMMENT '商铺名称',
`shoplevel` int COMMENT '商铺等级',
`status` int COMMENT '商铺状态',
`createtime` string COMMENT '创建日期',
`modifytime` string COMMENT '修改日期',
`startdate` string COMMENT '生效起始日期',
`enddate` string COMMENT '失效结束日期'
) COMMENT '商家信息表';
-- 初始化拉链表(加载6.29号的数据):
insert overwrite table hw.dim_trade_shops
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime,
case when modifytime is not null
then from_unixtime(unix_timestamp(modifytime,'yyyy-MM-dd HH:mm:ss'),'yyyy-MM-dd')
else createtime
end as startdate,
'9999-12-31' as enddate
from hw.ods_trade_shops
where dt='2020-06-29';
将加载语句写成脚本:
/data/lagoudw/script/hw/dim_load_trade_shops.sh
#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert overwrite table hw.dim_trade_shops
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime,
case when modifytime is not null
then from_unixtime(unix_timestamp(modifytime,'yyyy-MM-dd HH:mm:ss'),'yyyy-MM-dd')
else createtime
end as startdate,
'9999-12-31' as enddate
from hw.ods_trade_shops
where dt='$do_date'
union all
select A.shopid,
A.userid,
A.areaid,
A.shopname,
A.shoplevel,
A.status,
A.createtime,
A.modifytime,
A.startdate,
case when B.shopid is not null and A.enddate = '9999-12-31'
then date_add('$do_date', -1)
else A.enddate end as enddate
from (select * from hw.ods_trade_shops where dt='$do_date') B
right join hw.dim_trade_shops A on B.shopid = A.shopid;
"
hive -e "$sql"
加载6.30 和7.1的数据:
sh /data/lagoudw/script/hw/dim_load_trade_shops.sh 2020-06-30
sh /data/lagoudw/script/hw/dim_load_trade_shops.sh 2020-07-01
检查数据:
拉链表回滚
/data/lagoudw/script/hw/dim_rollback_trade_shops.sh
#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
drop table hw.tmp;
create table hw.tmp as
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime, startdate, enddate
from hw.dim_trade_shops
where enddate < '$do_date'
union all
select shopid, userid, areaid, shopname, shoplevel, status, createtime, modifytime, startdate, '9999-12-31' as enddate
from hw.dim_trade_shops
where startdate <= '$do_date' and enddate >= '$do_date';
"
hive -e "$sql"
测试回滚6.29:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qZpxyLXO-1639935559016)(C:\Users\jane\AppData\Roaming\Typora\typora-user-images\image-20211219204744428.png)]
2. 在会员分析中计算沉默会员数和流失会员数
沉默会员的定义:只在安装当天启动过App,而且安装时间是在7天前
流失会员的定义:最近30天未登录的会员
首先我将表 dws.dws_member_start_day 截取了一小部分的data,在我的hw数据库中重新创建,修改一些符合标准的数据
use hw;
drop table if exists hw.dws_member_start_day;
create table hw.dws_member_start_day
(
`device_id` string,
`uid` string,
`app_v` string,
`os_type` string,
`language` string,
`channel` string,
`area` string,
`brand` string
) COMMENT '会员日启动汇总'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by ',';
准备数据:
/data/lagoudw/data/hw/member-start-day-2020-06-29.dat
1FB872-9A1001,2F10092A1,1.1.0,0.97,chinese WM,三门峡,xiaomi-3,2020-06-29
1FB872-9A10010,2F10092A10,1.1.5,0.60,chinese ZD,江门,Huawei-9,2020-06-29
1FB872-9A100100,2F10092A100,1.1.10,7.2,chinese WK,金坛,xiaomi-6,2020-06-29
/data/lagoudw/data/hw/member-start-day-2020-07-02.dat
1FB872-9A1001,2F10092A1,1.1.0,0.97,chinese WM,三门峡,xiaomi-3,2020-07-02
1FB872-9A100100,2F10092A100,1.1.10,7.2,chinese WK,金坛,xiaomi-6,2020-07-02
1FB872-9A1001000,2F10092A1000,1.1.3,8.1,chinese UJ,唐山,xiaomi-0,2020-07-02
1FB872-9A1001001,2F10092A1001,1.1.10,8.3.5,chinese LB,乳山,xiaomi-7,2020-07-02
1FB872-9A1001002,2F10092A1002,1.1.9,0.0.0,chinese MZ,温州,xiaomi-0,2020-07-02
1FB872-9A1001003,2F10092A1003,1.1.14,3.20,chinese SL,安阳,Huawei-1,2020-07-02
/data/lagoudw/data/hw/member-start-day-2020-07-08.dat
1FB872-9A1001002,2F10092A1002,1.1.9,0.0.0,chinese MZ,温州,xiaomi-0,2020-07-08
1FB872-9A1001003,2F10092A1003,1.1.14,3.20,chinese SL,安阳,Huawei-1,2020-07-08
/data/lagoudw/data/hw/member-start-day-2020-08-05.dat
1FB872-9A1001,2F10092A1,1.1.0,0.97,chinese WM,三门峡,xiaomi-3,2020-08-05
1FB872-9A1001002,2F10092A1002,1.1.9,0.0.0,chinese MZ,温州,xiaomi-0,2020-08-05
1FB872-9A1001003,2F10092A1003,1.1.14,3.20,chinese SL,安阳,Huawei-1,2020-08-05
1FB872-9A1001004,2F10092A1004,1.1.9,9.9, chinese YX,威海,Huawei-2,2020-08-05
#导入数据:
load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-06-29.dat' into table hw.dws_member_start_day
partition(dt='2020-06-29');
load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-07-02.dat' into table hw.dws_member_start_day
partition(dt='2020-07-02');
load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-07-08.dat' into table hw.dws_member_start_day
partition(dt='2020-07-08');
load data local inpath '/data/lagoudw/data/hw/member-start-day-2020-08-05.dat' into table hw.dws_member_start_day
partition(dt='2020-08-05');
计算沉默会员:
假定今日是2020-07-08, 筛选出的会员将只在2020-06-29登录,即只登录了一次
#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
drop table hw.silent_cnt;
create table hw.silent_cnt as
-- 先选出七天前登陆的用户
select count(*) as silent_member, '$do_date' as dt
from
(select device_id from hw.dws_member_start_day
where dt<date_add('$do_date', -7)) A join
-- 只登陆的一次的用户
(select device_id, count(device_id) as cnt from hw.dws_member_start_day group by device_id having cnt=1) B on A.device_id = B.device_id
"
hive -e "$sql"
计算流失会员:
假定今日是2020-08-05,筛选出的会员30天前登陆过但是近30天内无登陆记录:
#!/bin/bash
source /etc/profile
if [ -n "$1" ]
then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
drop table hw.lost_cnt;
create table hw.lost_cnt as
select count(distinct device_id) as lost_member, '$do_date'
from hw.dws_member_start_day
where dt<=date_add('$do_date', -30)
and device_id not in (
select device_id
from hw.dws_member_start_day
where dt>date_add('$do_date', -30) and dt<='$do_date'
)
"
hive --hiveconf "hive.strict.checks.cartesian.product=false" -e "$sql"
3. 在核心交易分析中完成如下指标的计算
-- 1.1 先统计统计每日订单笔数和销售总额,只统计有效订单:
drop table if exists dws.dws_trade_orders_day;
create table if not exists dws.dws_trade_orders_day(
day_dt string comment '日期:yyyy-MM-dd',
day_cnt decimal comment '日订单笔数',
day_sum decimal comment '日订单总额'
) comment '日订单统计表';
insert overwrite table dws.dws_trade_orders_day
select dt, count(*) cnt, sum(totalmoney) sum_money
from (
select distinct orderid, dt, totalmoney from dwd.dwd_trade_orders where dataFlag='1'
) tmp
group by dt;
-- 1.2 尝试测试:
select * from dws.dws_trade_orders_day where dt between '2020-01-01' and '2020-12-31';
-- 2 统计2020年每个季度的销售订单笔数、订单总额
drop table if exists dws.dws_trade_orders_season;
create table if not exists dws.dws_trade_orders_season(
year string comment '年份',
season_dt string comment '季度:S1, S2, S3, S4',
season_cnt decimal comment '季度订单笔数',
season_sum decimal comment '季度订单总额'
) comment '季度订单统计表';
with tmp as(
select substr(day_dt, 0, 4) as year,
case when substr(day_dt, 6, 2)='01' or substr(day_dt, 6, 2)='02' or substr(day_dt, 6, 2)='03' then "1"
when substr(day_dt, 6, 2)='04' or substr(day_dt, 6, 2)='05' or substr(day_dt, 6, 2)='06' then "2"
when substr(day_dt, 6, 2)='07' or substr(day_dt, 6, 2)='08' or substr(day_dt, 6, 2)='09' then "3"
when substr(day_dt, 6, 2)='10' or substr(day_dt, 6, 2)='11' or substr(day_dt, 6, 2)='12' then "4" end as season_dt,
day_cnt,
day_sum
from dws.dws_trade_orders_day
)
insert overwrite table dws.dws_trade_orders_season
select year,
season_dt,
sum(day_cnt),
sum(day_sum)
from tmp
group by year, season_dt;
-- 3. 统计2020年每个月的销售订单笔数、订单总额
drop table if exists dws.dws_trade_orders_month;
create table if not exists dws.dws_trade_orders_month(
year string comment '年份',
month_dt string comment '月份',
month_cnt decimal comment '月份订单笔数',
month_sum decimal comment '月份订单总额'
) comment '月份订单统计表';
with tmp as(
select substr(day_dt, 0, 4) as year,
substr(day_dt, 6, 2) as month_dt,
day_cnt,
day_sum
from dws.dws_trade_orders_day
)
insert overwrite table dws.dws_trade_orders_month
select year,
month_dt,
sum(day_cnt),
sum(day_sum)
from tmp
group by year, month_dt;
– 统计2020年每周(周一到周日)的销售订单笔数、订单总额
使用udf函数得到对应日期的周数:
package cn.lagou.dw.hive.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import java.text.ParseException;
import java.util.Calendar;
import java.util.Date;
import java.text.SimpleDateFormat;
public class WeekOfYear extends UDF {
public int evaluate(final Text val) throws ParseException {
String datestr = val.toString();
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
Date dt = sdf.parse(datestr);
Calendar c1 = Calendar.getInstance();
c1.setFirstDayOfWeek(Calendar.MONDAY);
c1.setTime(dt);
int dayOfWeek = c1.get(Calendar.WEEK_OF_YEAR);
return dayOfWeek;
}
}
接下来使用此函数进行周表计算:
drop table if exists dws.dws_trade_orders_week;
create table if not exists dws.dws_trade_orders_week(
year string comment '年份',
month_dt string comment '周数',
month_cnt decimal comment '周份订单笔数',
month_sum decimal comment '周份订单总额'
) comment '周份订单统计表';
with tmp as(
select substr(day_dt, 0, 4) as year,
weekOfYear(day_dt) as week,
day_cnt,
day_sum
from dws.dws_trade_orders_day
)
insert overwrite table dws.dws_trade_orders_week
select year,
week,
sum(day_cnt),
sum(day_sum)
from tmp group by year, week;
– 统计2020年国家法定节假日、休息日、工作日的订单笔数、订单总额
创建日期维表,用来统计当日是否是节假日or休息日or工作日
drop table if exists dim.day_info;
create table if not exists dim.day_info(
year string comment '年份',
dt string comment '日期',
is_holiday int comment '1: yes; 0: no',
is_workDay int comment '1: yes; 0: no'
) comment '周份订单统计表';
-- 法定假日统计
select A.year, nvl(sum(A.day_cnt),0), nvl(sum(A.day_sum),0) from
dws.dws_trade_orders_day group by year A join
select dt from dim.day_info where is_holiday=1
B on A.dt = B.dt;
-- 休息日统计
select A.year, nvl(sum(A.day_cnt),0), nvl(sum(A.day_sum),0) from
dws.dws_trade_orders_day join
select dt from dim.day_info where is_workDay=0
B on A.dt = B.dt
group by year;
-- 工作日统计
select A.year, nvl(sum(A.day_cnt),0), nvl(sum(A.day_sum),0) from
dws.dws_trade_orders_day join
select dt from dim.day_info where is_workDay=1
B on A.dt = B.dt
group by year;