136.如何进行离线计算-2

最新推荐文章于 2022-11-22 10:46:27 发布

大勇任卷舒

最新推荐文章于 2022-11-22 10:46:27 发布

阅读量192

点赞数

分类专栏： Hadoop 文章标签： hive 大数据 hadoop

@大勇若怯任卷舒

本文链接：https://blog.csdn.net/m0_47454596/article/details/127862715

版权

Hadoop 专栏收录该内容

145 篇文章 3 订阅

订阅专栏

136.1 数据建模

维度建模
专门适用于OLAP的设计模式存在着两种类型的表：事实表维度表
- 事实表：主题的客观度量能够以记录主题为准信息多不精准
- 维度表：看问题分析问题的角度信息精但是不全可跟事实表关系
维度建模三种常见模型
- 星型模型一个事实表带多个维度表维度之间没关系数仓发展建立初期（一个主题）
- 雪花模型一个事实表带多个维度表维度之间可以继续关系维度不利于维护少用
- 星座模型多个事实表带多个维度有些维度可以共用数仓发展后期（多个主题）
不管什么模型，在数仓中，一切有利于数据分析即可为，不用考虑数据冗余性和其他设计规范
模块设计–维度建模
- 在本项目中，因为分析主题只有一个（网站流量日志），所有采用星型模型
- 事实表---->对应清洗完之后的数据
- 维度表----->来自于提前通过工具生成维度表范围要横跨事实表分析维度
- 点击流模型属于业务模型数据既不是事实表也不是维度表是为了后续计算某些业务指标方便而由业务指定
宽表：为了分析，把原来表中某些字段属性提取出来，构成新的字段也称之为明细表
- 窄表：没有扩宽的表原始表
- 宽表数据来自于窄表 insert（宽）+select (窄)
- 总结：hive中，有几种方式可以创建出带有数据的表？
  - create+load data 创建表加载数据（内部表）
  - create +external +location 创建外部表指定数据路径
  - create+insert+select 表的数据来自于后面查询语句返回的结果
  - create+select 创建的表结构和数据来自于后面的查询语句

 # -- hive内置解析url的函数
 
 parse_url_tuple（url,host path,query,queryvalue）
 
 # -- 通常用于把后面的表挂接在左边的表之上 返回成为一个新表
 
 a LATERAL VIEW b 
 LATERAL VIEW
 
 create table t_ods_tmp_referurl as SELECT a.*,b.* FROM ods_weblog_origin a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as host, path, query, query_id;

group by 语法限制

select count(*) as pvs from ods_weblog_detail t where datestr='20130918' group by t.hour

select t.hour,count(*) as pvs from ods_weblog_detail t where datestr='20130918' group by t.hour

# -- 在有group by的语句中，出现在select后面的字段要么是分组的字段要么是被聚合函数包围的字段。
解决：
select t.day,t.hour,count(*) as pvs from ods_weblog_detail t where datestr='20130918' group by t.day,t.hour;

136.2 ETL

1.宽表生成

生成ods+url解析表

create table t_ods_tmp_referurl as
SELECT a.*,b.*
FROM ods_weblog_origin a 
LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH','QUERY', 'QUERY:id') b as host, path, query, query_id;

生成ods+url+date解析表

create table t_ods_tmp_detail as 
select b.*,substring(time_local,0,10) as daystr,
substring(time_local,12) as tmstr,
substring(time_local,6,2) as month,
substring(time_local,9,2) as day,
substring(time_local,11,3) as hour
From t_ods_tmp_referurl b;

综合

create table ods_weblog_detail(
valid           string, --有效标识
remote_addr     string, --来源IP
remote_user     string, --用户标识
time_local      string, --访问完整时间
daystr          string, --访问日期
timestr         string, --访问时间
month           string, --访问月
day             string, --访问日
hour            string, --访问时
request         string, --请求的url
status          string, --响应码
body_bytes_sent string, --传输字节数
http_referer    string, --来源url
ref_host        string, --来源的host
ref_path        string, --来源的路径
ref_query       string, --来源参数query
ref_query_id    string, --来源参数query的值
http_user_agent string --客户终端标识
)
partitioned by(datestr string);

insert into table  ods_weblog_detail partition(datestr='20130918') 
select c.valid,c.remote_addr,c.remote_user,c.time_local,
substring(c.time_local,0,10) as daystr,
substring(c.time_local,12) as tmstr,
substring(c.time_local,6,2) as month,
substring(c.time_local,9,2) as day,
substring(c.time_local,12,2) as hour,
c.request,c.status,c.body_bytes_sent,c.http_referer,c.ref_host,c.ref_path,c.ref_query,c.ref_query_id,c.http_user_agent
from 
(select a.*,b.*
from ods_weblog_origin a
LATERAL view 
parse_url_tuple(regexp_replace(a.http_referer,"\"",""),'HOST','PATH','QUERY','QUERY_ID')b as ref_host, ref_path, ref_query, ref_query_id) c;

2.DML分析

计算该处理批次（一天）中的各小时 pvs

select 
t.month,t.day,t.hour,count(*)
from ods_weblog_detail t
where t.datestr='20130918'
group by t.month,t.day,t.hour;

计算每天的pvs

select t.month,t.day,count(*) from ods_weblog_detail t where t.datestr='20130918' group by t.month,t.day;

select a.month,a.day,sum(a.pvs)
from 
(
 select 
 t.month as month,t.day as day,t.hour as hour,count(*)  as pvs
 from ods_weblog_detail t
 where t.datestr='20130918'
 group by t.month,t.day,t.hour
) a 
group by a.month,a.day;

统计每小时各来访url产生的pvs

select 
t.day,t.hour,t.http_referer,t.ref_host,count(*)
from ods_weblog_detail t
where datestr='20130918'
group by t.day,t.hour,t.http_referer,t.ref_host
having t.ref_host is not null;

统计每小时各来访host的产生的pv数并排序

select 
t.month,t.day,t.hour,t.ref_host,count(*) as pvs
from ods_weblog_detail t
where datestr='20130918'
group by t.month,t.day,t.hour,t.ref_host
having t.ref_host is not null
order by t.hour asc ,pvs desc;

按照时间维度，统计一天内各小时产生最多pvs的来源（host）topN(分组Top)

select 
a.month,a.day,a.hour,a.host,a.pvs,a.rmp
from
(
 select 
 t.month as month,t.day as day,t.hour as hour,t.ref_host as host,count(*) as pvs,
 row_number()over(partition by concat(t.month,t.day,t.hour) order by pvs desc) rmp
 from ods_weblog_detail t
 where datestr='20130918'
 group by t.month,t.day,t.hour,t.ref_host
 having t.ref_host is not null
 order by hour asc ,pvs desc
)a 
where a.rmp < 4;

统计今日所有来访者平均请求的页面数。

select count(*)/count(distinct remote_addr) from ods_weblog_detail where datestr='20130918';

select
sum(a.pvs)/count(a.ip)
from
(
 select
 t.remote_addr as ip,count(*) as pvs
 from ods_weblog_detail t
 where t.datestr='20130918'
 group by t.remote_addr
) a;

统计每日最热门的页面 top10

select 
t.request,count(*) as counts
from ods_weblog_detail t
where datestr='20130918'
group by t.request
order by counts desc
limit 10;

每日新访客

select 
today.ip
from 
(
 select distinct t.remote_addr as ip 
 from ods_weblog_detail t
) today 
left join history
on today.ip=history.ip
where  history.ip is null;

查询今日所有回头访客及其访问次数（session）

select
remote_addr,count(session) as cs
from ods_click_stream_visit
where datestr='20130918'
group by remote_addr
having cs >1;

人均访问频次

select 
count(session)/count(distinct remote_addr)
from ods_click_stream_visit
where datestr='20130918';

级联查询自join

select 
rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs  
from  dw_oute_numbs rn
inner join 
dw_oute_numbs rr;

# -- 绝对转化

select 
a.rrstep,a.rrnumbs/a.rnnumbs
from 
(
 select 
 rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs  
 from  dw_oute_numbs rn
 inner join dw_oute_numbs rr
)a
where a.rnstep='step1';

# -- 相对转化

select 
tmp.rrstep as step,tmp.rrnumbs/tmp.rnnumbs as leakage_rate
from
(
 select rn.step as rnstep,rn.numbs as rnnumbs,rr.step as rrstep,rr.numbs as rrnumbs from 	dw_oute_numbs rn
 inner join 
 dw_oute_numbs rr
) tmp
where cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1;