1、创建日志原始表
-- create table
drop table if exists hive_ycong.track_log ;
create table hive_ycong.track_log(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
partitioned by(date string, hour string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;
2、导入数据
set parquet.compression=SNAPPY ;
load data local inpath '/data/tracklogs/20160321/2016032119' overwrite
into table hive_ycong.track_log partition(date = '20160321' ,hour = '19');
load data local inpath '/data/tracklogs/20160321/2016032120' overwrite
into table hive_ycong.track_log partition(date = '20160321' ,hour = '20');
3、创建会话临时表session_info_temp
drop table if exists hive_ycong.session_info_temp ;
create table hive_ycong.session_info_temp as
select
a.sessionid session_id ,
max(a.guid) guid,
max(a.enduserid) user_id ,
count(a.url) pv ,
(unix_timestamp(max(a.tracktime)) - unix_timestamp(min(a.tracktime))) stay_time,
min(a.tracktime) min_trackTime ,
max(a.ip) ip,
max(a.provinceid) provinceId
from hive_ycong.track_log a where date = '20160321'
group by sessionid;**
4、创建会话临时表session_url_temp
drop table if exists hive_ycong.session_url_temp ;
create table hive_ycong.session_url_temp as
select
sessionid,
tracktime,
trackeru,
url,
referer
from hive_ycong.track_log where date = '20160321';
5、创建会话表session_info,并加载数据
create table hive_ycong.session_info(
session_id string ,
guid string ,
trackerU string ,
landing_url string ,
landing_url_ref string ,
user_id string ,
pv string ,
stay_time string ,
min_trackTime string ,
ip string ,
provinceId string
)
partitioned by (date string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
insert overwrite table hive_ycong.session_info partition(date = '20160321')
select
a.session_id session_id ,
max(a.guid) guid ,
max(b.trackeru) trackerU ,
max(b.url) landing_url ,
max(b.referer) landing_url_ref ,
max(a.user_id) user_id ,
max(a.pv) pv ,
max(a.stay_time / 1000) stay_time ,
max(a.min_trackTime) min_trackTime ,
max(a.ip) ip ,
max(a.provinceId) provinceId
from hive_ycong.session_info_temp a
join hive_ycong.session_url_temp b on a.session_id = b.sessionid and a.min_trackTime = b.tracktime
group by a.session_id;
6、结果分析:日期 UV PV 登陆人数 游客人数 平均访问时长 二跳率 独立IP数
drop table if exists hive_ycong.visit_daily_temp ;
create table hive_ycong.visit_daily_temp as
select
date,
count(distinct guid) uv,
sum(pv) pv,
count(distinct case when user_id != '' then user_id else null end) login_users,
count(distinct case when user_id = '' then guid else null end) visit_users,
avg(stay_time) avg_stay_time,
count(case when pv >= 2 then session_id else null end) / count(session_id) second_rate,
count(distinct ip) ip_number
from hive_ycong.session_info where date = '20160321'
group by date;
- 没有权限的用户
grant all on *.* to root@'bigdata.eclipse.com' identified by '123456'