一、数据仓库模型
| valid | string | |
| ip | string | |
| user_id | string | |
| time | string | |
| daystr | string | |
| timestr | string | |
| month | string | |
| day | string | |
| hour | string | |
| request | string
注意:构建星型模型的好处:虽然会有一定的数据冗余,
但是数据分析时候不用关联更深层次的表进行数据处理,处理数据更高效。
星型模型:时间维度:年、季度、月、周、天、小时
雪花模型:时间维度:时间
浏览分析
统计每天每小时的PV数
分析维度:天、小时
select
daystr,
hour ,
count(request) as pv
from
weblog_detail
group by
daystr,hour;
统计每天的pv数
select
daystr,
count(request) as pv
from
weblog_detail
group by
daystr;
统计每月的pv数
注意:注意:如果做了分区这么写可以,如果没有需要在分组字段上面加上year
select
month,
count(request) as pv
from
weblog_detail
group by
month where year='';
--如果分区字段:年/月/日
月:01-12
--如果分区字段:年/月/日
月:01-12
select year ,month ,count(request) as pv from table group by year,month;
--如果分区字段:日(yyyy-MM-dd)
月:yyyy-MM
年/月:yyyy/MM
统计不同终端维度的PV数
user_agent:
browser_t/browser_v
os_t/os_v
统计不同浏览器终端的pv数据
select
browser_t,
browser_v,
count(request) as pv
from
weblog_detail
group by
browser_t,browser_v;
select user_agent,count(request) as pv from weblog_detail group by user_agent;
统计不同终端类型的pv数据
统计谷歌浏览器的不同终端的pv数
select
user_agent,count(request) as pv
from
weblog_detail where user_agent
like
'%Chrome%' and user_agent like "%Linux%" group by user_agent;
统计每个月每个终端的pv数
select
month,user_agent ,count(request) as pv
from
weblog_detail
group by
month,user_agent;
统计每天人均pv数
本案例中以ip代表用户,实际工作中有用户id字段
总pv/用户个数
select
daystr,
count(request)/count(distinct ip) as avg_pv
from
weblog_detail
group by
daystr;
基于visit模型进行统计
select
daystr,
sum(pageNum)/count(distinct ip) as avg_pv
from
click_visit
group by
daystr;
注意:解析:sum(pageNum)已经提前聚合过了,效率比count(request)高很多。
统计每天每个来源的pv数
select
daystr,
ref_host,count(request) as pv
from
weblog_detail
where
ref_host is not null
group by
daystr,ref_host
order by
pv desc;
统计每天每个小时来源中产生最多的pv的的TOP3
2018-05-23 00 www.baidu.com 90888
2018-05-23 00 www.qq.com 90123
2018-05-23 01 www.baidu.com 88888
2018-05-23 01 www.qq.com 34567
分析:分组后排序,窗口函数
分组:
group by:分组后只有一条
窗口函数:row_number()
先求出每天每个小时每个来源的pv数
select
tmp2.daystr,
tmp2.hour,
tmp2.ref_host,
tmp2.pv,
tmp2.numb
from (
select
tmp1.daystr,
tmp1.hour,
tmp1.ref_host,
tmp1.pv,
row_number() over (partition by tmp1.daystr,tmp1.hour order by tmp1.pv desc )
as numb
from(
select
daystr,hour,ref_host ,count(request) as pv
from
weblog_detail
where
ref_host is not null
group by
daystr,hour,ref_host
) tmp1
) tmp2 where tmp2.numb < 4;
受访分析
统计每日热门的页面的topN
统计每天的每个页面的访问数量
select
tmp1.daystr,tmp1.request,tmp1.pv,
row_number() over (partition tmp1.daystr order by tmp1.pv) as numb
from
(select
daystr,request,count(*) as pv
from
weblog_detail
group by
daystr ,request)
tmp1;
访客分析
每个访客每小时的访问数
本题中的ip应该替换为用户id字段
select
ip,hour,count(request) as pv
from
weblog_detail
group by ip,hour;
每个小时的ip访问数
select
ip,daystr,hour,count(request) as pv
from
weblog_detail
group by
ip,daystr,hour;
每天的/每月的ip访问数
select ip,daystr,count(request) as pv from weblog_detail group by ip,daystr;
select ip,month,count(request) as pv from weblog_detail group by ip,month;
统计新增访客
如何区分一个用户时老用户还是新增用户?
新增用户:从来没有访问过,第一次访问的用户
与老用户的区别:有没有客户id
guid:访客id
在程序中通过读取cookie,判断当前访问是否存在guid
如果不存在,表示是一个新用户
缺点:清除cookie导致guid丢失,下一次又是新用户
第一种方式:监听所有请求,如果用户提交访问,那么我们读取cookie,如果没有guid,该请求标记为特殊的请求事件,打标签,该请求没有guid,isNew=false
处理日志时判断isNew字段
第二种方式:判断该用户的id是否出现过
创建一张表:history_user
用于记录所有访问过的用户id
让今天所有的访问的用户id与历史表中的id进行join
今天:
user1 user2 user3 user4
history_user:
user1 user4
select a.id as newId from tody_log a left join history_user b on a.id=b.id where b.id is null;
newId oldId
user1 user1
user2 null
user3 null
user4 user4
注意:第二种方式效率低(两种表的数据比较大,join会很消耗资源),第一种方式是公司工作常用的方法。
将新的id最后再插入history_user
新增会员:第一次注册
url:用户访问的所有url中包含注册成功的页面
visit模型分析
单次访客统计
只访问了一次的访客
select
ip ,count(distinct session) as times
from
click_visit
group by
ip
having count(distinct session) = 1;
回头访客个数
访问了不止一次的访客
select
ip ,count(distinct session) as times
from
click_visit
group by
ip
having count(distinct session) > 1;
人均访问频率
select avg(tmp.times) as avg_time from (
select
ip ,count(distinct session) as times
from
click_visit
group by
ip
) as tmp ;
用户平均访问页数
select avg(tmp.pvs) as avg_pv from (
select
ip ,count(request) as pvs
from
weblog_detail
group by
ip
) as tmp ;
select sum(pageNum) /count(distinct ip) as avg_ip from click_visit;
pageview模型分析
每个用户的访问时长
将每个用户访问的所有页面的停留时长进行统计
select ip,sum(staylong) as length from click_pageviews group by ip;
用户的平均访问时长
select sum(staylong) /count(distinct ip) as avg_length from click_pageviews;
访问页面的漏斗
页面转化 分析
step1:https://www.baidu.com/ 1000
step2:http://news.baidu.com/ 800
step3:http://news.baidu.com/guonei 300
step4:http://www.jiemian.com/article/2194607.html 20
统计:每一步的转化及流失
Step1、 /hadoop-mahout
Step2、 /hadoop-hive
Step3、 /hadoop-zookeeper
Step4、 /black-ip-list
结果: rate retained
step1 1000 100% null
step2 800 80% 80%
step3 500 50% 62.5%
step4 200 20% 40%
先求出每一步用户的个数
create table rs_tmp1 (
step string,
numb int
) row format delimited fields terminated by '\t';
insert overwrite table rs_tmp1
select 'step1',count(*) as numb from weblog_detail where request like '/hadoop-hive%'
union all
select 'step2',count(*) as numb from weblog_detail where request like '/hadoop-mahout%'
union all
select 'step3',count(*) as numb from weblog_detail where request like '/hadoop-zookeeper%'
union all
select 'step4',count(*) as numb from weblog_detail where request like '/black-ip-list%';
笛卡尔积
a
b
c
1
2
3
a 1
b 1
c 1
a 2
b 2
create table rs_tmp2 as
select
a.step as r1step,
a.numb as numb1,
b.step as r2step,
b.numb as numb2
from
rs_tmp1 a
join
rs_tmp1 b;
create table rs_tmp3 as
select
r1step,
numb1/numb2 as rate
from
rs_tmp2
where
r2step = 'step1';
create table rs_tmp4 as
select
r1step,
numb1/numb2 as retained
from
rs_tmp2
where
cast(substr(r1step,5,1) as int) - 1 = cast(substr(r2step,5,1) as int);
SELECT
a.step,
a.numb,
b.rate,
c.retained
FROM
rs_tmp1 a
JOIN rs_tmp3 b ON a.step = b.r1step
LEFT JOIN rs_tmp4 c ON b.r1step = c.r1step;
访客
流失:上一个时间维度访问了,当前时间维度没有访问
回流:上一个时间维度没有访问,当前时间维度访问了
会员
会话
订单
来源