目录
需求一:用户活跃主题
DWS层
目标:统计当日、当周、当月活跃的每个设备明细
每日活跃设备明细
1)建表语句
-- Daily active-device detail table (DWS layer), partitioned by day (dt).
DROP TABLE IF EXISTS dws_uv_detail_day;

CREATE TABLE dws_uv_detail_day (
    `mid_id`       STRING COMMENT '设备唯一标识',
    `user_id`      STRING COMMENT '用户标识',
    `version_code` STRING COMMENT '程序版本号',
    `version_name` STRING COMMENT '程序版本名',
    `lang`         STRING COMMENT '系统语言',
    `source`       STRING COMMENT '渠道号',
    `os`           STRING COMMENT '安卓系统版本',
    `area`         STRING COMMENT '区域',
    `model`        STRING COMMENT '手机型号',
    `brand`        STRING COMMENT '手机品牌',
    `sdk_version`  STRING COMMENT 'sdkVersion',
    `gmail`        STRING COMMENT 'gmail',
    `height_width` STRING COMMENT '屏幕宽高',
    `app_time`     STRING COMMENT '客户端日志产生时的时间',
    `network`      STRING COMMENT '网络模式',
    `lng`          STRING COMMENT '经度',
    `lat`          STRING COMMENT '纬度'
)
COMMENT '活跃用户按天明细'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_uv_detail_day/';
2)数据导入
以用户单日访问为key进行聚合,如果某个用户在一天中使用了两种操作系统、两个系统版本、多个地区,登录不同账号,只取其中之一。
-- Build the 2019-02-10 partition of dws_uv_detail_day from the start log:
-- one output row per mid_id. For every other attribute an arbitrary one of
-- the values seen that day is kept (first element of collect_set).
set hive.exec.dynamic.partition.mode=nonstrict;

INSERT OVERWRITE TABLE dws_uv_detail_day PARTITION (dt)
SELECT
    mid_id,
    collect_set(user_id)[0]      AS user_id,
    collect_set(version_code)[0] AS version_code,
    collect_set(version_name)[0] AS version_name,
    collect_set(lang)[0]         AS lang,
    collect_set(source)[0]       AS source,
    collect_set(os)[0]           AS os,
    collect_set(area)[0]         AS area,
    collect_set(model)[0]        AS model,
    collect_set(brand)[0]        AS brand,
    collect_set(sdk_version)[0]  AS sdk_version,
    collect_set(gmail)[0]        AS gmail,
    collect_set(height_width)[0] AS height_width,
    collect_set(app_time)[0]     AS app_time,
    collect_set(network)[0]      AS network,
    collect_set(lng)[0]          AS lng,
    collect_set(lat)[0]          AS lat,
    -- last column feeds the dynamic partition key dt
    '2019-02-10'                 AS dt
FROM dwd_start_log
WHERE dt = '2019-02-10'
GROUP BY mid_id;
每周活跃设备明细
根据日用户访问明细,获得周用户访问明细
1)建表语句
-- Weekly active-device detail table (DWS layer), partitioned by week (wk_dt).
DROP TABLE IF EXISTS dws_uv_detail_wk;

CREATE TABLE dws_uv_detail_wk (
    `mid_id`       STRING COMMENT '设备唯一标识',
    `user_id`      STRING COMMENT '用户标识',
    `version_code` STRING COMMENT '程序版本号',
    `version_name` STRING COMMENT '程序版本名',
    `lang`         STRING COMMENT '系统语言',
    `source`       STRING COMMENT '渠道号',
    `os`           STRING COMMENT '安卓系统版本',
    `area`         STRING COMMENT '区域',
    `model`        STRING COMMENT '手机型号',
    `brand`        STRING COMMENT '手机品牌',
    `sdk_version`  STRING COMMENT 'sdkVersion',
    `gmail`        STRING COMMENT 'gmail',
    `height_width` STRING COMMENT '屏幕宽高',
    `app_time`     STRING COMMENT '客户端日志产生时的时间',
    `network`      STRING COMMENT '网络模式',
    `lng`          STRING COMMENT '经度',
    `lat`          STRING COMMENT '纬度',
    `monday_date`  STRING COMMENT '周一日期',
    `sunday_date`  STRING COMMENT '周日日期'
)
COMMENT '活跃用户按周明细'
PARTITIONED BY (`wk_dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_uv_detail_wk/';
2)数据导入
-- Build the weekly partition that contains 2019-02-10 from the daily detail
-- table: one row per mid_id over the Monday..Sunday range of that week.
--   Monday = date_add(next_day(d, 'MO'), -7)
--   Sunday = date_add(next_day(d, 'MO'), -1)
-- The partition key wk_dt is '<monday>_<sunday>'.
set hive.exec.dynamic.partition.mode=nonstrict;

INSERT OVERWRITE TABLE dws_uv_detail_wk PARTITION (wk_dt)
SELECT
    mid_id,
    collect_set(user_id)[0]      AS user_id,
    collect_set(version_code)[0] AS version_code,
    collect_set(version_name)[0] AS version_name,
    collect_set(lang)[0]         AS lang,
    collect_set(source)[0]       AS source,
    collect_set(os)[0]           AS os,
    collect_set(area)[0]         AS area,
    collect_set(model)[0]        AS model,
    collect_set(brand)[0]        AS brand,
    collect_set(sdk_version)[0]  AS sdk_version,
    collect_set(gmail)[0]        AS gmail,
    collect_set(height_width)[0] AS height_width,
    collect_set(app_time)[0]     AS app_time,
    collect_set(network)[0]      AS network,
    collect_set(lng)[0]          AS lng,
    collect_set(lat)[0]          AS lat,
    date_add(next_day('2019-02-10', 'MO'), -7) AS monday_date,
    date_add(next_day('2019-02-10', 'MO'), -1) AS sunday_date,
    concat(
        date_add(next_day('2019-02-10', 'MO'), -7),
        '_',
        date_add(next_day('2019-02-10', 'MO'), -1)
    ) AS wk_dt
FROM dws_uv_detail_day
WHERE dt >= date_add(next_day('2019-02-10', 'MO'), -7)
  AND dt <= date_add(next_day('2019-02-10', 'MO'), -1)
GROUP BY mid_id;
每月活跃设备明细
1)建表语句
-- Monthly active-device detail table (DWS layer), partitioned by month (mn).
-- Declared EXTERNAL, unlike the daily and weekly detail tables.
DROP TABLE IF EXISTS dws_uv_detail_mn;

CREATE EXTERNAL TABLE dws_uv_detail_mn (
    `mid_id`       STRING COMMENT '设备唯一标识',
    `user_id`      STRING COMMENT '用户标识',
    `version_code` STRING COMMENT '程序版本号',
    `version_name` STRING COMMENT '程序版本名',
    `lang`         STRING COMMENT '系统语言',
    `source`       STRING COMMENT '渠道号',
    `os`           STRING COMMENT '安卓系统版本',
    `area`         STRING COMMENT '区域',
    `model`        STRING COMMENT '手机型号',
    `brand`        STRING COMMENT '手机品牌',
    `sdk_version`  STRING COMMENT 'sdkVersion',
    `gmail`        STRING COMMENT 'gmail',
    `height_width` STRING COMMENT '屏幕宽高',
    `app_time`     STRING COMMENT '客户端日志产生时的时间',
    `network`      STRING COMMENT '网络模式',
    `lng`          STRING COMMENT '经度',
    `lat`          STRING COMMENT '纬度'
)
COMMENT '活跃用户按月明细'
PARTITIONED BY (`mn` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_uv_detail_mn/';
2)数据导入
-- Build the monthly partition that contains 2019-02-10 from the daily detail
-- table: one row per mid_id over the whole calendar month.
-- The partition key mn is the month formatted as 'yyyy-MM'.
set hive.exec.dynamic.partition.mode=nonstrict;

INSERT OVERWRITE TABLE dws_uv_detail_mn PARTITION (mn)
SELECT
    mid_id,
    collect_set(user_id)[0]      AS user_id,
    collect_set(version_code)[0] AS version_code,
    collect_set(version_name)[0] AS version_name,
    collect_set(lang)[0]         AS lang,
    collect_set(source)[0]       AS source,
    collect_set(os)[0]           AS os,
    collect_set(area)[0]         AS area,
    collect_set(model)[0]        AS model,
    collect_set(brand)[0]        AS brand,
    collect_set(sdk_version)[0]  AS sdk_version,
    collect_set(gmail)[0]        AS gmail,
    collect_set(height_width)[0] AS height_width,
    collect_set(app_time)[0]     AS app_time,
    collect_set(network)[0]      AS network,
    collect_set(lng)[0]          AS lng,
    collect_set(lat)[0]          AS lat,
    date_format('2019-02-10', 'yyyy-MM') AS mn
FROM dws_uv_detail_day
-- Filter the partition column dt with a plain range (first day of month ..
-- last day of month) instead of wrapping it in date_format(): a function
-- applied to dt prevents partition pruning and scans every daily partition.
-- Assumes dt values are 'yyyy-MM-dd' strings, as written by the daily load.
WHERE dt >= trunc('2019-02-10', 'MM')
  AND dt <= last_day('2019-02-10')
GROUP BY mid_id;
DWS层加载数据脚本
#!/bin/bash
# DWS loader: rebuilds the daily, weekly and monthly active-device detail
# partitions that contain one log date.
# Usage: <script> [yyyy-MM-dd]     (no argument -> the day before today)

# Variables defined up front so they are easy to change
APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1 when there is one; otherwise use yesterday.
# "$1" must be quoted: unquoted, an empty $1 collapses the test to `[ -n ]`,
# which is always true, so the fallback branch would never run and
# log_date would be left empty.
if [ -n "$1" ]; then
    log_date=$1
else
    log_date=$(date -d "-1 day" +%F)
fi

# Three statements, all keyed on $log_date:
#   1. daily   : one row per mid_id from dwd_start_log for that day
#   2. weekly  : one row per mid_id over Monday..Sunday of that week
#   3. monthly : one row per mid_id over that calendar month
# The weekly and monthly statements group by mid_id ONLY, so each device
# appears once per period (grouping by extra attributes would emit several
# rows per device and inflate the downstream count(*)).
# sunday_date is "next Monday - 1 day", the same expression used for the
# upper bound of the week filter and for the wk_dt partition key.
# The month filter is a plain range on dt so partitions can be pruned.
sql="
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table ${APP}.dws_uv_detail_day partition(dt='$log_date')
select
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area,
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat
from ${APP}.dwd_start_log
where dt='$log_date'
group by mid_id;

insert overwrite table ${APP}.dws_uv_detail_wk partition(wk_dt)
select
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area,
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat,
    date_add(next_day('$log_date','MO'),-7),
    date_add(next_day('$log_date','MO'),-1),
    concat(date_add(next_day('$log_date','MO'),-7), '_', date_add(next_day('$log_date','MO'),-1))
from ${APP}.dws_uv_detail_day
where dt>=date_add(next_day('$log_date','MO'),-7)
  and dt<=date_add(next_day('$log_date','MO'),-1)
group by mid_id;

insert overwrite table ${APP}.dws_uv_detail_mn partition(mn)
select
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0] lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area,
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0] app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0] lng,
    collect_set(lat)[0] lat,
    date_format('$log_date','yyyy-MM')
from ${APP}.dws_uv_detail_day
where dt>=trunc('$log_date','MM')
  and dt<=last_day('$log_date')
group by mid_id;
"

$hive -e "$sql"
ADS层
目标:当日、当周、当月活跃设备数
活跃设备数
1)建表语句
hive (gmall)>
-- ADS table holding, per statistics date, the daily / weekly / monthly
-- active-device counts plus week-end and month-end flags.
DROP TABLE IF EXISTS ads_uv_count;

CREATE EXTERNAL TABLE ads_uv_count (
    `dt`          STRING COMMENT '统计日期',
    `day_count`   BIGINT COMMENT '当日用户数量',
    `wk_count`    BIGINT COMMENT '当周用户数量',
    `mn_count`    BIGINT COMMENT '当月用户数量',
    `is_weekend`  STRING COMMENT 'Y,N是否是周末,用于得到本周最终结果',
    `is_monthend` STRING COMMENT 'Y,N是否是月末,用于得到本月最终结果'
)
COMMENT '每日活跃用户数量'
STORED AS PARQUET
LOCATION '/warehouse/gmall/ads/ads_uv_count_day/';
2)导入数据
hive (gmall)>
-- Append the 2019-02-10 row to ads_uv_count: active-device counts for that
-- day, for the week containing it and for the month containing it.
-- INSERT INTO (not INSERT OVERWRITE): the table is unpartitioned and keeps
-- one row per statistics date, so overwriting would discard every earlier
-- day's row.
-- NOTE(review): re-running this for the same date appends a duplicate row.
INSERT INTO TABLE ads_uv_count
SELECT
    '2019-02-10' AS dt,
    daycount.ct  AS day_count,
    wkcount.ct   AS wk_count,
    mncount.ct   AS mn_count,
    -- 'Y' when the date is the Sunday that closes its week
    if(date_add(next_day('2019-02-10', 'MO'), -1) = '2019-02-10', 'Y', 'N') AS is_weekend,
    -- 'Y' when the date is the last day of its month
    if(last_day('2019-02-10') = '2019-02-10', 'Y', 'N') AS is_monthend
FROM (
    SELECT
        '2019-02-10' AS dt,
        count(*)     AS ct
    FROM dws_uv_detail_day
    WHERE dt = '2019-02-10'
) daycount
JOIN (
    SELECT
        '2019-02-10' AS dt,
        count(*)     AS ct
    FROM dws_uv_detail_wk
    WHERE wk_dt = concat(
        date_add(next_day('2019-02-10', 'MO'), -7),
        '_',
        date_add(next_day('2019-02-10', 'MO'), -1)
    )
) wkcount
    ON daycount.dt = wkcount.dt
JOIN (
    SELECT
        '2019-02-10' AS dt,
        count(*)     AS ct
    FROM dws_uv_detail_mn
    WHERE mn = date_format('2019-02-10', 'yyyy-MM')
) mncount
    ON daycount.dt = mncount.dt;
ADS层加载数据脚本
#!/bin/bash
# ADS loader: appends one row to ads_uv_count with the daily, weekly and
# monthly active-device counts for one log date.
# Usage: <script> [yyyy-MM-dd]     (no argument -> the day before today)

# Variables defined up front so they are easy to change
APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1 when there is one; otherwise use yesterday.
# "$1" must be quoted: unquoted, an empty $1 collapses the test to `[ -n ]`,
# which is always true, so the fallback branch would never run and
# log_date would be left empty.
if [ -n "$1" ]; then
    log_date=$1
else
    log_date=$(date -d "-1 day" +%F)
fi

# Each sub-query returns a single row tagged with $log_date; the three rows
# are joined on that tag to form one output row.
# is_weekend  = 'Y' when $log_date is the Sunday closing its week.
# is_monthend = 'Y' when $log_date is the last day of its month.
sql="
set hive.exec.dynamic.partition.mode=nonstrict;

insert into table ${APP}.ads_uv_count
select
    '$log_date' dt,
    daycount.ct,
    wkcount.ct,
    mncount.ct,
    if(date_add(next_day('$log_date','MO'),-1)='$log_date','Y','N'),
    if(last_day('$log_date')='$log_date','Y','N')
from
(
    select
        '$log_date' dt,
        count(*) ct
    from ${APP}.dws_uv_detail_day
    where dt='$log_date'
) daycount
join
(
    select
        '$log_date' dt,
        count(*) ct
    from ${APP}.dws_uv_detail_wk
    where wk_dt=concat(date_add(next_day('$log_date','MO'),-7),'_',date_add(next_day('$log_date','MO'),-1))
) wkcount on daycount.dt=wkcount.dt
join
(
    select
        '$log_date' dt,
        count(*) ct
    from ${APP}.dws_uv_detail_mn
    where mn=date_format('$log_date','yyyy-MM')
) mncount on daycount.dt=mncount.dt;
"

$hive -e "$sql"
需求二:用户新增主题
1)建表语句
hive (gmall)>
-- New-device table (DWS layer): device attributes plus the date on which the
-- device was recorded as new (create_date). Not partitioned.
DROP TABLE IF EXISTS `dws_new_mid_day`;

CREATE TABLE `dws_new_mid_day` (
    `mid_id`       STRING COMMENT '设备唯一标识',
    `user_id`      STRING COMMENT '用户标识',
    `version_code` STRING COMMENT '程序版本号',
    `version_name` STRING COMMENT '程序版本名',
    `lang`         STRING COMMENT '系统语言',
    `source`       STRING COMMENT '渠道号',
    `os`           STRING COMMENT '安卓系统版本',
    `area`         STRING COMMENT '区域',
    `model`        STRING COMMENT '手机型号',
    `brand`        STRING COMMENT '手机品牌',
    `sdk_version`  STRING COMMENT 'sdkVersion',
    `gmail`        STRING COMMENT 'gmail',
    `height_width` STRING COMMENT '屏幕宽高',
    `app_time`     STRING COMMENT '客户端日志产生时的时间',
    `network`      STRING COMMENT '网络模式',
    `lng`          STRING COMMENT '经度',
    `lat`          STRING COMMENT '纬度',
    `create_date`  STRING COMMENT '创建时间'
)
COMMENT '每日新增设备信息'
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_new_mid_day/';
2)导入数据
用每日活跃用户表 left join 每日新增设备表,关联的条件是mid_id相等。如果是每日新增的设备,则在每日新增设备表中为null。
hive (gmall)>
-- Append the devices that are new on 2019-02-10: devices active that day
-- (dws_uv_detail_day) with no existing row in dws_new_mid_day.
-- Anti-join: LEFT JOIN on mid_id, then keep only the rows where the
-- right-hand side did not match (nm.mid_id IS NULL).
INSERT INTO TABLE dws_new_mid_day
SELECT
    ud.mid_id,
    ud.user_id,
    ud.version_code,
    ud.version_name,
    ud.lang,
    ud.source,
    ud.os,
    ud.area,
    ud.model,
    ud.brand,
    ud.sdk_version,
    ud.gmail,
    ud.height_width,
    ud.app_time,
    ud.network,
    ud.lng,
    ud.lat,
    '2019-02-10' AS create_date
FROM dws_uv_detail_day ud
LEFT JOIN dws_new_mid_day nm
    ON ud.mid_id = nm.mid_id
WHERE ud.dt = '2019-02-10'
  AND nm.mid_id IS NULL;
ADS层(每日新增设备表)
1)建表语句
hive (gmall)>
-- ADS table: number of new devices per creation date.
-- Stored as tab-delimited text rather than parquet.
DROP TABLE IF EXISTS `ads_new_mid_count`;

CREATE TABLE `ads_new_mid_count` (
    `create_date`   STRING COMMENT '创建时间',
    `new_mid_count` BIGINT COMMENT '新增设备数量'
)
COMMENT '每日新增设备信息数量'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_new_mid_count/';
2)导入数据
hive (gmall)>
-- Append the new-device count for 2019-02-10.
INSERT INTO TABLE ads_new_mid_count
SELECT
    create_date,
    count(*) AS new_mid_count
FROM dws_new_mid_day
WHERE create_date = '2019-02-10'
GROUP BY create_date;
需求三:用户留存主题
用户留存概念
需求描述
DWS层
1)建表语句
hive (gmall)>
-- Retention detail table (DWS layer): device attributes plus the date the
-- device was new (create_date) and the retention day number, partitioned by
-- the statistics day (dt).
DROP TABLE IF EXISTS `dws_user_retention_day`;

CREATE TABLE `dws_user_retention_day` (
    `mid_id`        STRING COMMENT '设备唯一标识',
    `user_id`       STRING COMMENT '用户标识',
    `version_code`  STRING COMMENT '程序版本号',
    `version_name`  STRING COMMENT '程序版本名',
    `lang`          STRING COMMENT '系统语言',
    `source`        STRING COMMENT '渠道号',
    `os`            STRING COMMENT '安卓系统版本',
    `area`          STRING COMMENT '区域',
    `model`         STRING COMMENT '手机型号',
    `brand`         STRING COMMENT '手机品牌',
    `sdk_version`   STRING COMMENT 'sdkVersion',
    `gmail`         STRING COMMENT 'gmail',
    `height_width`  STRING COMMENT '屏幕宽高',
    `app_time`      STRING COMMENT '客户端日志产生时的时间',
    `network`       STRING COMMENT '网络模式',
    `lng`           STRING COMMENT '经度',
    `lat`           STRING COMMENT '纬度',
    `create_date`   STRING COMMENT '设备新增时间',
    `retention_day` INT    COMMENT '截止当前日期留存天数'
)
COMMENT '每日用户留存情况'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_user_retention_day/';
2)导入数据(每天计算前1天的新用户访问留存明细)
hive (gmall)>
-- 1-day retention for statistics day 2019-02-11: devices that were new on
-- the previous day (create_date = 2019-02-11 minus 1) and are active on
-- 2019-02-11. Device attributes are taken from the new-device row.
INSERT OVERWRITE TABLE dws_user_retention_day PARTITION (dt = "2019-02-11")
SELECT
    nm.mid_id,
    nm.user_id,
    nm.version_code,
    nm.version_name,
    nm.lang,
    nm.source,
    nm.os,
    nm.area,
    nm.model,
    nm.brand,
    nm.sdk_version,
    nm.gmail,
    nm.height_width,
    nm.app_time,
    nm.network,
    nm.lng,
    nm.lat,
    nm.create_date,
    1 AS retention_day
FROM dws_uv_detail_day ud
JOIN dws_new_mid_day nm
    ON ud.mid_id = nm.mid_id
WHERE ud.dt = '2019-02-11'
  AND nm.create_date = date_add('2019-02-11', -1);
DWS层(1,2,3,n天留存用户明细表)
1)导入数据(每天计算前1,2,3,n天的新用户访问留存明细)
hive (gmall)>
-- 1-, 2- and 3-day retention for statistics day 2019-02-11, in one pass:
-- devices active on 2019-02-11 that were new 1, 2 or 3 days earlier.
-- retention_day is derived as the day gap between the statistics day and
-- create_date, so a single join replaces three UNION ALL branches that
-- differed only in the offset. To cover n days, widen the lower bound
-- (date_add('2019-02-11', -n)).
-- Assumes create_date is a 'yyyy-MM-dd' string, as written by the
-- dws_new_mid_day load.
INSERT OVERWRITE TABLE dws_user_retention_day PARTITION (dt = "2019-02-11")
SELECT
    nm.mid_id,
    nm.user_id,
    nm.version_code,
    nm.version_name,
    nm.lang,
    nm.source,
    nm.os,
    nm.area,
    nm.model,
    nm.brand,
    nm.sdk_version,
    nm.gmail,
    nm.height_width,
    nm.app_time,
    nm.network,
    nm.lng,
    nm.lat,
    nm.create_date,
    datediff('2019-02-11', nm.create_date) AS retention_day
FROM dws_uv_detail_day ud
JOIN dws_new_mid_day nm
    ON ud.mid_id = nm.mid_id
WHERE ud.dt = '2019-02-11'
  AND nm.create_date >= date_add('2019-02-11', -3)
  AND nm.create_date <= date_add('2019-02-11', -1);
ADS层
留存用户数
1)建表语句
hive (gmall)>
-- ADS table: number of retained devices per (create_date, retention_day).
DROP TABLE IF EXISTS `ads_user_retention_day_count`;

CREATE TABLE `ads_user_retention_day_count` (
    `create_date`   STRING COMMENT '设备新增日期',
    `retention_day` INT    COMMENT '截止当前日期留存天数',
    retention_count BIGINT COMMENT '留存数量'
)
COMMENT '每日用户留存情况'
STORED AS PARQUET
LOCATION '/warehouse/gmall/ads/ads_user_retention_day_count/';
2)导入数据
hive (gmall)>
-- Append retained-device counts for statistics day 2019-02-11, one row per
-- (create_date, retention_day) found in that day's retention partition.
INSERT INTO TABLE ads_user_retention_day_count
SELECT
    create_date,
    retention_day,
    count(*) AS retention_count
FROM dws_user_retention_day
WHERE dt = '2019-02-11'
GROUP BY
    create_date,
    retention_day;
留存用户比率
1)建表语句
hive (gmall)>
-- ADS table: retention ratio per (stat_date, create_date, retention_day).
-- LOCATION points at this table's own directory. It previously reused
-- .../ads_user_retention_day_count/, the directory of a different table with
-- a different schema, so both tables would have read each other's files.
-- NOTE(review): new_mid_count is declared STRING although the source column
-- in ads_new_mid_count is BIGINT; left unchanged to keep the table schema
-- stable for existing readers — consider BIGINT.
DROP TABLE IF EXISTS `ads_user_retention_day_rate`;

CREATE TABLE `ads_user_retention_day_rate` (
    `stat_date`       STRING        COMMENT '统计日期',
    `create_date`     STRING        COMMENT '设备新增日期',
    `retention_day`   INT           COMMENT '截止当前日期留存天数',
    `retention_count` BIGINT        COMMENT '留存数量',
    `new_mid_count`   STRING        COMMENT '当日设备新增数量',
    `retention_ratio` DECIMAL(10,2) COMMENT '留存率'
)
COMMENT '每日用户留存情况'
STORED AS PARQUET
LOCATION '/warehouse/gmall/ads/ads_user_retention_day_rate/';
2)导入数据
hive (gmall)>
-- Append retention ratios for statistics day 2019-02-11.
-- ur: retained-device count per (create_date, retention_day) for that day.
-- nc: new-device count per create_date.
-- The ratio is retained / new, multiplied by 100.
INSERT INTO TABLE ads_user_retention_day_rate
SELECT
    '2019-02-11'                               AS stat_date,
    ur.create_date,
    ur.retention_day,
    ur.retention_count,
    nc.new_mid_count,
    ur.retention_count / nc.new_mid_count * 100 AS retention_ratio
FROM (
    SELECT
        create_date,
        retention_day,
        count(*) AS retention_count
    FROM `dws_user_retention_day`
    WHERE dt = '2019-02-11'
    GROUP BY
        create_date,
        retention_day
) ur
JOIN ads_new_mid_count nc
    ON nc.create_date = ur.create_date;