一、需求分析
二、分析指标 PV UV 登录人数 游客人数 平均访问时长 二跳率
PV :有多少用户访问了页面(一次访问记做一次记录)
UV :有多少用户访问了页面(不管访问多少页面)
登录人数 :会员人数
游客人数 :非会员人数
平均时长 :每个用户开始访问时间到离开时间平均值
二跳率 :用户点击页面次数大于等于2
三、实现
a. HIVE创建数据库
create database if not exists onehd_shop;
use onehd_shop;
create table if not exists yhd_source(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
partitioned by (date string)
row format delimited fields terminated by '\t'
stored as textfile
load data local inpath '/home/liuwl/opt/datas/2015082818' into table yhd_source partition (date='2015082818');
load data local inpath '/home/liuwl/opt/datas/2015082819' into table yhd_source partition (date='2015082819');
b. 过滤字段
-> 建立use表
create table if not exists session_info(
session_id string,
guid string,
trackerU string,
landing_url string,
landing_url_ref string,
user_id string,
pv string,
stay_time string,
min_trackTime string,
ip string,
provinceId string
) partitioned by (date string)
row format delimited fields terminated by '\t'
stored as textfile
从use表中看到某些字段不能被直接得出
所以需要使用建立临时表过渡一下
-> 建立session_tmp临时表
create table session_tmp as
select
a.sessionId session_id ,
max(a.guid) guid,
max(a.endUserId) user_id ,
count(a.url) pv ,
(unix_timestamp(max(a.trackTime)) - unix_timestamp(min(a.trackTime))) stay_time ,
min(a.trackTime) min_trackTime ,
max(a.ip) ip ,
max(a.provinceId) provinceId
from yhd_source a
where date='2015082818'
group by a.sessionId;
-> 为了job的执行效率可以建立一个从source过滤出来的表track_tmp
create table track_tmp as
select
sessionId,
trackTime,
trackerU,
url,
referer
from yhd_source
where date='2015082818';
-> 两张表进行join
insert overwrite table session_info partition (date='2015082818')
select
a.session_id session_id,
max(a.guid) guid,
max(b.trackerU) trackerU,
max(b.url) landing_url,
max(b.referer) landing_url_ref,
max(a.user_id) user_id ,
max(a.pv) pv,
max(a.stay_time) stay_time,
max(a.min_trackTime) min_trackTime,
max(a.ip) ip,
max(a.provinceId) provinceId
from session_tmp a join track_tmp b
on a.session_id = b.sessionId and a.min_trackTime = b.trackTime
group by a.session_id;
-> 对session_info进行分析需求指标
create table if not exists one_result as
select
date date,
sum(pv) pv,
count(distinct guid) uv,
count(distinct case when user_id != '' then guid else null end) login_user,
count(distinct case when user_id = '' then guid else null end) visit_user,
avg(stay_time) avg_stay,
count(case when pv>=2 then session_id else null end )/count(session_id) second_reate ,
count(distinct ip) ip
from session_info
where date='2015082818'
group by date;
需要注意的是
登录人数与游客人数条件中user_id在源数据表中可能不以null出现
四、结果:
日期 PV UV 登录人数 游客人数 平均访问时长 二跳率 独立IP
2015082818 64972.0 23928 11586 12367 49.74171774059963 0.5096074209044227 19174