1 活跃用户
1.1 DWS层
目标:统计当日、当周、当月活跃用户
活跃用户指的是在统计周期内有过通话记录的用户
1.1.1 每日活跃用户明细
1)建表语句
hive (calllogs)>
-- Daily active-user detail: one row per msisdn with at least one call record
-- on the statistics day, partitioned by that day.
DROP TABLE IF EXISTS dws_au_detail_day;
CREATE TABLE dws_au_detail_day (
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string
)
COMMENT '活跃用户按天明细'
PARTITIONED BY (`dt` string)
STORED AS PARQUET
LOCATION '/warehouse/calllogs/dws/dws_au_detail_day/';
2)数据导入
以用户手机号为key进行聚合
hive (calllogs)>
-- Load one day of active users into dws_au_detail_day (dynamic partition dt).
-- group by msisdn dedupes a user's multiple call records for the day;
-- collect_set(col)[0] keeps one arbitrary value per attribute -- assumes these
-- attributes are stable per msisdn within a day; TODO confirm upstream.
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_au_detail_day partition(dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
'2021-12-07'
from dwd_calllogs_cdr
where dt='2021-12-07'
group by msisdn;
3)查询导入结果
hive (calllogs)> select * from dws_au_detail_day limit 5;
hive (calllogs)> select count(*) from dws_au_detail_day;
1.1.2 每周活跃用户明细
1)建表语句
hive (calllogs)>
-- Weekly active-user detail: one row per msisdn active during the week.
-- The week_dt partition key is built by the loader as '<monday>_<sunday>'.
DROP TABLE IF EXISTS dws_au_detail_week;
CREATE TABLE dws_au_detail_week (
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string,
    monday_date         string,
    sunday_date         string
)
COMMENT '活跃用户按周明细'
PARTITIONED BY (`week_dt` string)
STORED AS PARQUET
LOCATION '/warehouse/calllogs/dws/dws_au_detail_week/';
2)数据导入
以用户手机号为key按周进行聚合
hive (calllogs)>
-- Load one week of active users into dws_au_detail_week (dynamic partition).
-- next_day(d,'MO') is the first Monday strictly after d, so
-- date_add(next_day(d,'MO'),-7) is the Monday of d's week and -1 its Sunday.
-- collect_set(col)[0] keeps one arbitrary value per attribute per msisdn.
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_au_detail_week partition(week_dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
date_add(next_day('2021-12-07','MO'),-7),
date_add(next_day('2021-12-07','MO'),-1),
-- FIX: build week_dt with '_' (was '-'). The loader script and the ADS query
-- both use '_' as the monday/sunday separator, so partitions written with '-'
-- were never matched by the ADS week_dt filter.
concat(date_add(next_day('2021-12-07','MO'),-7),'_',date_add(next_day('2021-12-07','MO'),-1))
from dwd_calllogs_cdr
where dt >= date_add(next_day('2021-12-07','MO'),-7) and dt <= date_add(next_day('2021-12-07','MO'),-1)
group by msisdn;
3)查询导入结果
hive (calllogs)> select * from dws_au_detail_week limit 5;
hive (calllogs)> select count(*) from dws_au_detail_week;
1.1.3 每月活跃用户明细
1)建表语句
hive (calllogs)>
-- Monthly active-user detail: one row per msisdn active during the month.
-- The month_dt partition key is the month string 'yyyy-MM'.
DROP TABLE IF EXISTS dws_au_detail_month;
CREATE TABLE dws_au_detail_month (
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string
)
COMMENT '活跃用户按月明细'
PARTITIONED BY (`month_dt` string)
STORED AS PARQUET
LOCATION '/warehouse/calllogs/dws/dws_au_detail_month/';
2)数据导入
以用户手机号为key按月进行聚合
hive (calllogs)>
-- Load one month of active users into dws_au_detail_month.
-- group by msisdn dedupes a user's call records across the whole month;
-- collect_set(col)[0] keeps one arbitrary value per attribute.
-- month_dt partition value is the month of the given date ('yyyy-MM').
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_au_detail_month partition(month_dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
date_format('2021-12-07','yyyy-MM')
from dwd_calllogs_cdr
where date_format(dt,'yyyy-MM') = date_format('2021-12-07','yyyy-MM')
group by msisdn;
3)查询导入结果
hive (calllogs)> select * from dws_au_detail_month limit 5;
hive (calllogs)> select count(*) from dws_au_detail_month;
1.1.4 DWS层活跃用户加载数据脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs_dws_au.sh
在脚本中编写如下内容
#!/bin/bash
# Load the DWS active-user detail tables (day / week / month) for one date.
# Usage: calllogs_dws_au.sh [yyyy-MM-dd]; defaults to yesterday when no arg.
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date if given, otherwise yesterday.
# FIX: quote "$1" -- the unquoted test [ -n $1 ] expands to [ -n ], which is
# always true, so the else branch (default date) could never be reached.
if [ -n "$1" ] ;then
calllogs_date=$1
else
calllogs_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$DB".dws_au_detail_day partition(dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
'$calllogs_date'
from "$DB".dwd_calllogs_cdr
where dt='$calllogs_date'
group by msisdn;
insert overwrite table "$DB".dws_au_detail_week partition(week_dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
date_add(next_day('$calllogs_date','MO'),-7),
date_add(next_day('$calllogs_date','MO'),-1),
concat(date_add(next_day('$calllogs_date','MO'),-7),'_',date_add(next_day('$calllogs_date','MO'),-1))
from "$DB".dwd_calllogs_cdr
where dt >= date_add(next_day('$calllogs_date','MO'),-7) and dt <= date_add(next_day('$calllogs_date','MO'),-1)
group by msisdn;
insert overwrite table "$DB".dws_au_detail_month partition(month_dt)
select
collect_set(sys_id)[0] sys_id,
collect_set(service_name)[0] service_name,
collect_set(home_province_code)[0] home_province_code,
collect_set(visit_province_code)[0] visit_province_code,
collect_set(channel_code)[0] channel_code,
collect_set(service_code)[0] service_code,
collect_set(imsi)[0] imsi,
msisdn,
date_format('$calllogs_date','yyyy-MM')
from "$DB".dwd_calllogs_cdr
where date_format(dt,'yyyy-MM') = date_format('$calllogs_date','yyyy-MM')
group by msisdn;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs_dws_au.sh
3)脚本使用
[hadoop@hadoop101 modules]$ calllogs_dws_au.sh 2021-12-08
4)查询结果
hive (calllogs)> select count(*) from dws_au_detail_day;
hive (calllogs)> select count(*) from dws_au_detail_week;
hive (calllogs)> select count(*) from dws_au_detail_month ;
5)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
1.2 ADS层
1.2.1 活跃用户数
目标:统计当日、当周、当月活跃用户数
1)建表语句
hive (calllogs)>
-- ADS summary: daily / weekly / monthly active-user counts, one row per
-- statistics date, plus weekend/month-end flags for picking final values.
DROP TABLE IF EXISTS ads_au_count;
CREATE EXTERNAL TABLE ads_au_count (
    `dt`          string COMMENT '统计日期',
    `day_count`   bigint COMMENT '当日用户数量',
    `wk_count`    bigint COMMENT '当周用户数量',
    `mn_count`    bigint COMMENT '当月用户数量',
    `is_weekend`  string COMMENT 'Y,N是否是周末,用于得到本周最终结果',
    `is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果'
)
COMMENT '每日活跃用户数量'
STORED AS PARQUET
LOCATION '/warehouse/calllogs/ads/ads_au_count/';
2)导入数据
hive (calllogs)>
-- Build one ads_au_count row for the stat date by joining three one-row
-- subqueries (day / week / month counts) on the shared constant dt key.
-- is_weekend: 'Y' when the stat date is the Sunday of its week;
-- is_monthend: 'Y' when the stat date is the last day of its month.
-- NOTE: week_dt is matched as '<monday>_<sunday>' with an underscore
-- separator -- the DWS week loader must build the partition key the same way.
insert overwrite table ads_au_count
select
'2021-12-07' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
if(date_add(next_day('2021-12-07','MO'),-1)='2021-12-07','Y','N') ,
if(last_day('2021-12-07')='2021-12-07','Y','N')
from
(
select
'2021-12-07' dt,
count(*) ct
from dws_au_detail_day
where dt='2021-12-07'
)daycount join
(
select
'2021-12-07' dt,
count (*) ct
from dws_au_detail_week
where week_dt=concat(date_add(next_day('2021-12-07','MO'),-7),'_' ,date_add(next_day('2021-12-07','MO'),-1) )
) wkcount on daycount.dt=wkcount.dt
join
(
select
'2021-12-07' dt,
count (*) ct
from dws_au_detail_month
where month_dt=date_format('2021-12-07','yyyy-MM')
)mncount on daycount.dt=mncount.dt;
3)查询导入结果
hive (calllogs)> select * from ads_au_count;
1.2.2 活跃用户数ADS层加载数据脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs_ads_au.sh
在脚本中编写如下内容
#!/bin/bash
# Load the ADS active-user count table (ads_au_count) for one date.
# Usage: calllogs_ads_au.sh [yyyy-MM-dd]; defaults to yesterday when no arg.
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date if given, otherwise yesterday.
# FIX: quote "$1" -- the unquoted test [ -n $1 ] expands to [ -n ], which is
# always true, so the else branch (default date) could never be reached.
if [ -n "$1" ] ;then
calllogs_date=$1
else
calllogs_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$DB".ads_au_count
select
'$calllogs_date' dt,
daycount.ct,
wkcount.ct,
mncount.ct,
if(date_add(next_day('$calllogs_date','MO'),-1)='$calllogs_date','Y','N') ,
if(last_day('$calllogs_date')='$calllogs_date','Y','N')
from
(
select
'$calllogs_date' dt,
count(*) ct
from "$DB".dws_au_detail_day
where dt='$calllogs_date'
)daycount join
(
select
'$calllogs_date' dt,
count (*) ct
from "$DB".dws_au_detail_week
where week_dt=concat(date_add(next_day('$calllogs_date','MO'),-7),'_' ,date_add(next_day('$calllogs_date','MO'),-1) )
) wkcount on daycount.dt=wkcount.dt
join
(
select
'$calllogs_date' dt,
count (*) ct
from "$DB".dws_au_detail_month
where month_dt=date_format('$calllogs_date','yyyy-MM')
)mncount on daycount.dt=mncount.dt;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs_ads_au.sh
3)脚本使用
[hadoop@hadoop101 modules]$ calllogs_ads_au.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select * from ads_au_count ;
2 新增用户
新增用户包括日新增用户、周新增用户、月新增用户。
2.1 DWS层
2.1.1每日新增用户明细表
1)建表语句
hive (calllogs)>
-- Cumulative new-user table: one row per msisdn, stamped with the date it
-- was first seen (create_date). The loader appends, never overwrites.
DROP TABLE IF EXISTS `dws_new_user_day`;
CREATE TABLE `dws_new_user_day` (
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string,
    create_date         string
)
COMMENT '每日新增用户信息'
STORED AS PARQUET
LOCATION '/warehouse/calllogs/dws/dws_new_user_day/';
2)导入数据
用每日活跃用户表 left join 每日新增用户表,关联的条件是msisdn相等。如果是当日新增的用户,则其在每日新增用户表中还没有记录,关联结果中nud.msisdn为null。
hive (calllogs)>
-- Append the day's newly-seen users: left join today's active users against
-- the cumulative new-user table itself; rows with no match (nud.msisdn is
-- null) are first-time users. "insert into" appends, preserving history.
-- NOTE(review): assumes one record per msisdn in dws_new_user_day -- a
-- duplicate there would fan out the join; confirm loader runs once per day.
insert into table dws_new_user_day
select
aud.sys_id,
aud.service_name,
aud.home_province_code,
aud.visit_province_code,
aud.channel_code,
aud.service_code,
aud.imsi,
aud.msisdn,
'2021-12-07'
from dws_au_detail_day aud left join dws_new_user_day nud on aud.msisdn=nud.msisdn
where aud.dt='2021-12-07' and nud.msisdn is null;
3)查询导入数据
hive (calllogs)> select count(*) from dws_new_user_day ;
2.1.2 DWS层每日新增用户数据加载脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs.dws_nu.sh
在脚本中编写如下内容
#!/bin/bash
# Append one date's newly-seen users into the cumulative dws_new_user_day.
# Usage: calllogs.dws_nu.sh [yyyy-MM-dd]; defaults to yesterday when no arg.
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date if given, otherwise yesterday.
# FIX: quote "$1" -- the unquoted test [ -n $1 ] expands to [ -n ], which is
# always true, so the else branch (default date) could never be reached.
if [ -n "$1" ] ;then
calllogs_date=$1
else
calllogs_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$DB".dws_new_user_day
select
aud.sys_id,
aud.service_name,
aud.home_province_code,
aud.visit_province_code,
aud.channel_code,
aud.service_code,
aud.imsi,
aud.msisdn,
'$calllogs_date'
from "$DB".dws_au_detail_day aud left join "$DB".dws_new_user_day nud on aud.msisdn=nud.msisdn
where aud.dt='$calllogs_date' and nud.msisdn is null;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs.dws_nu.sh
3)脚本使用
[hadoop@hadoop101 modules]$ calllogs.dws_nu.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select * from dws_new_user_day ;
2.2 ADS层
2.2.1每日新增用户表
1)建表语句
hive (calllogs)>
-- ADS daily new-user counts, stored as tab-separated text.
DROP TABLE IF EXISTS `ads_new_user_count`;
CREATE TABLE `ads_new_user_count` (
    `create_date`    string comment '创建时间',
    `new_user_count` BIGINT comment '新增用户数量'
)
COMMENT '每日新增用户数'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/calllogs/ads/ads_new_user_count/';
2)导入数据
hive (calllogs)>
-- Append one summary row: how many users were first seen on the stat date.
INSERT INTO TABLE ads_new_user_count
SELECT
    create_date,
    count(*)
FROM dws_new_user_day
WHERE create_date = '2021-12-07'
GROUP BY create_date;
3)查询导入数据
hive (calllogs)> select * from ads_new_user_count;
2.2.2每日新增用户表加载数据脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs.ads_nu.sh
在脚本中编写如下内容
#!/bin/bash
# Load the ADS daily new-user count (ads_new_user_count) for one date.
# Usage: calllogs.ads_nu.sh [yyyy-MM-dd]; defaults to yesterday when no arg.
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date if given, otherwise yesterday.
# FIX: quote "$1" -- the unquoted test [ -n $1 ] expands to [ -n ], which is
# always true, so the else branch (default date) could never be reached.
if [ -n "$1" ] ;then
calllogs_date=$1
else
calllogs_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$DB".ads_new_user_count
select create_date, count(*) from "$DB".dws_new_user_day
where create_date='$calllogs_date'
group by create_date ;
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs.ads_nu.sh
3)脚本使用
[hadoop@hadoop101 bin]$ calllogs.ads_nu.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select * from ads_new_user_count;
3 留存用户
3.1 用户留存概念
留存用户:某段时间内的新增用户(活跃用户),经过一段时间后,又继续使用应用的被认作是留存用户;
留存率:留存用户占当时新增用户(活跃用户)的比例即是留存率。
例如,2月10日新增用户100,这100人在2月11日启动过应用的有30人,2月12日启动过应用的有25人,2月13日启动过应用的有32人;
则2月10日新增用户次日的留存率是30/100 = 30%,两日留存率是25/100=25%,三日留存率是32/100=32%。
3.2 用户留存分析
分析:假设今天是11日,要统计前1天也就是10日新增用户的留存率,公式如下:
10日新增用户的留存率= 10日的新增用户且11日活跃的 / 10日的新增用户
1)分母获取
(1)创建每日新增用户明细表
(2)通过每日新增用户明细表获取10日新增用户
2)分子获取
(1)10日新增的用户明细 join 11日活跃用户明细表=每日留存用户表
(2)对每日留存用户表count就得到了10日新增用户在11日的留存用户
3)留存率计算
(1) 10日的新增用户 且 11日活跃的用户表 与10日新增用户join,算出留存用户
3.3 DWS层
3.3.1 每日留存用户明细表
1)建表语句
hive (calllogs)>
-- Daily retention detail: users still active on the stat date (partition dt),
-- tagged with their creation date and how many days they have been retained.
DROP TABLE IF EXISTS `dws_user_retention_day`;
CREATE TABLE `dws_user_retention_day` (
    sys_id              string,
    service_name        string,
    home_province_code  string,
    visit_province_code string,
    channel_code        string,
    service_code        string,
    imsi                string,
    msisdn              string,
    `create_date`       string comment '设备新增时间',
    `retention_day`     int comment '截止当前日期留存天数'
)
COMMENT '每日用户留存情况'
PARTITIONED BY (`dt` string)
STORED AS PARQUET
LOCATION '/warehouse/calllogs/dws/dws_user_retention_day/';
2)导入数据(每天计算前1天的新用户访问留存明细)
hive (calllogs)>
-- 1-day retention for the stat date: users created exactly one day earlier
-- (nud.create_date = stat date - 1) who are active on the stat date.
-- Overwrites the stat date's partition.
insert overwrite table dws_user_retention_day partition(dt="2021-12-07")
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
1 retention_day
from dws_au_detail_day aud join dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='2021-12-07' and nud.create_date=date_add('2021-12-07',-1);
3)查询导入数据(每天计算前1天的新用户访问留存明细)
hive (calllogs)> select count(*) from dws_user_retention_day;
3.3.2 1,2,3,n天留存用户明细表
1)导入数据(每天计算前1,2,3,n天的新用户访问留存明细)
hive (calllogs)>
-- 1-, 2-, and 3-day retention for the stat date in one statement: each
-- union-all branch keeps users created N days earlier who are active today,
-- tagging them with retention_day = N. Overwrites the stat date's partition
-- (replacing any earlier 1-day-only load for the same date).
insert overwrite table dws_user_retention_day partition(dt="2021-12-07")
-- Branch 1: created yesterday, active today.
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
1 retention_day
from dws_au_detail_day aud join dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='2021-12-07' and nud.create_date=date_add('2021-12-07',-1)
union all
-- Branch 2: created two days ago, active today.
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
2 retention_day
from dws_au_detail_day aud join dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='2021-12-07' and nud.create_date=date_add('2021-12-07',-2)
union all
-- Branch 3: created three days ago, active today.
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
3 retention_day
from dws_au_detail_day aud join dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='2021-12-07' and nud.create_date=date_add('2021-12-07',-3);
2)查询导入数据(每天计算前1,2,3天的新用户访问留存明细)
hive (calllogs)> select retention_day, count(*) from dws_user_retention_day group by retention_day;
3.3.3 1,2,3,n天留存用户明细数据加载脚本
1)在hadoop101的/home/hadoop/bin目录下创建脚本
[hadoop@hadoop101 bin]$ vim calllogs_dws_user_retention.sh
在脚本中编写如下内容
#!/bin/bash
# Load 1/2/3-day retention detail into dws_user_retention_day for one date.
# Usage: calllogs_dws_user_retention.sh [yyyy-MM-dd]; defaults to yesterday.
DB=calllogs
hive=/opt/modules/hive/bin/hive
# Use the supplied date if given, otherwise yesterday.
# FIX: quote "$1" -- the unquoted test [ -n $1 ] expands to [ -n ], which is
# always true, so the else branch (default date) could never be reached.
if [ -n "$1" ] ;then
calllogs_date=$1
else
calllogs_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$DB".dws_user_retention_day partition(dt='$calllogs_date')
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
1 retention_day
from "$DB".dws_au_detail_day aud join "$DB".dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='$calllogs_date' and nud.create_date=date_add('$calllogs_date',-1)
union all
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
2 retention_day
from "$DB".dws_au_detail_day aud join "$DB".dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='$calllogs_date' and nud.create_date=date_add('$calllogs_date',-2)
union all
select
nud.sys_id,
nud.service_name,
nud.home_province_code,
nud.visit_province_code,
nud.channel_code,
nud.service_code,
nud.imsi,
nud.msisdn,
nud.create_date,
3 retention_day
from "$DB".dws_au_detail_day aud join "$DB".dws_new_user_day nud on aud.msisdn =nud.msisdn
where aud.dt='$calllogs_date' and nud.create_date=date_add('$calllogs_date',-3);
"
$hive -e "$sql"
2)增加脚本执行权限
[hadoop@hadoop101 bin]$ chmod +x calllogs_dws_user_retention.sh
3)脚本使用
[hadoop@hadoop101 bin]$ calllogs_dws_user_retention.sh 2021-12-08
4)脚本执行时间
企业开发中一般在每日凌晨0点30分~1点执行
5)查询导入结果
hive (calllogs)> select retention_day, count(*) from dws_user_retention_day group by retention_day;
3.4 ADS层
3.4.1留存用户数
1)建表语句
hive (calllogs)>
-- ADS retention counts: rows of (create_date, retention_day, count).
DROP TABLE IF EXISTS `ads_user_retention_day_count`;
CREATE TABLE `ads_user_retention_day_count` (
    `create_date`   string comment '用户新增日期',
    `retention_day` int comment '截止当前日期留存天数',
    retention_count bigint comment '留存数量'
)
COMMENT '每日用户留存情况'
STORED AS PARQUET
LOCATION '/warehouse/calllogs/ads/ads_user_retention_day_count/';
2)导入数据
hive (calllogs)>
-- Append one count row per (create_date, retention_day) found in the
-- stat date's retention-detail partition.
INSERT INTO TABLE ads_user_retention_day_count
SELECT
    create_date,
    retention_day,
    count(*) AS retention_count
FROM dws_user_retention_day
WHERE dt = '2021-12-07'
GROUP BY create_date, retention_day;
3)查询导入数据
hive (calllogs)> select * from ads_user_retention_day_count;
3.4.2 留存用户比率
1)建表语句
hive (calllogs)>
-- ADS retention rate per (create_date, retention_day).
-- NOTE(review): new_user_count is declared string here but BIGINT in
-- ads_new_user_count; string works for Hive's implicit casts, but aligning
-- the types would be cleaner -- confirm before changing downstream readers.
drop table if exists `ads_user_retention_day_rate`;
create table `ads_user_retention_day_rate`
(
`stat_date` string comment '统计日期',
`create_date` string comment '用户新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数量',
`new_user_count` string comment '当日用户新增数量',
`retention_ratio` decimal(10,2) comment '留存率'
) COMMENT '每日用户留存情况'
stored as parquet
-- FIX: location previously pointed at .../ads_user_retention_day_count/,
-- making this table share the count table's data directory -- each table
-- would read (and a drop would delete) the other's files.
location '/warehouse/calllogs/ads/ads_user_retention_day_rate/';
2)导入数据
hive (calllogs)>
-- Retention rate = retained users / users first seen on create_date * 100.
-- ur: retention counts from the stat date's partition of the detail table;
-- joined to ads_new_user_count on the creation date to get the denominator.
insert into table ads_user_retention_day_rate
select
'2021-12-07' ,
ur.create_date,
ur.retention_day,
ur.retention_count ,
nuc.new_user_count,
ur.retention_count/nuc.new_user_count*100
from
(
select
create_date,
retention_day,
count(*) retention_count
from `dws_user_retention_day`
where dt='2021-12-07'
group by create_date,retention_day
) ur join ads_new_user_count nuc on nuc.create_date=ur.create_date;
3)查询导入数据
hive (calllogs)>select * from ads_user_retention_day_rate;