hive数仓构建

转载自:https://blog.csdn.net/myloveprogrmming/article/details/85813114
1、获取数据源
后台的服务数据
前台的点击流日志数据
业务数据

2、确定主题
用户主题:用户、会员相关的信息
订单主题:订单相关的信息
浏览器主题:跟浏览器相关的信息
事件主题:跟事件相关

3、创建模型(创建表)
100张表以内不分层,以外将分层。
如果维度表较多则将维度表单独分层

1、创建维度层:
dim_维度表
地域维度:拆分成 省 市
浏览器维度:浏览器名称、版本
时间维度:周、季度
事件维度:category(种类)、action
平台维度:
kpi:(指标)

创建ods库:注:ods=operate database store
创建dw库:dw=data warehouse 数据仓库
创建dm库: dm=date manipulation数据操作

命名规则:
库名_主题_表名
ods_user_addr_
ODS_USER_ADDR_(大小写均可,但最好小写)
ods.user.addr不行
事实表
create database if not exists dim;//纬度
create database if not exists ods;//操作数据库存储
create database if not exists dw;//数据仓库
create database if not exists dm;//数据操作

在维度库下面创建维度表:注:字段带有特殊符号用这个解决,这里desc是关键字,所以用
CREATE TABLE IF NOT EXISTS dim_province (
id int,
province string,
country_id int,
desc string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_city (
id int,
city string,
desc string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_province_city (
dim_region_id bigint,
dim_region_city_name string,
dim_region_province_name string,
dim_region_country_name string,
dim_region_city_id string,
dim_region_province_id string,
dim_region_country_id string,
dim_region_date string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_platform (
id int,
platform_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_kpi (
id int,
kpi_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_name (
id int,
name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_category (
id int,
category string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_action (
id int,
action string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_browser_name (
id int,
browser_name string,
browser_version_id int
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_browser_version (
id int,
browser_version string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_order (
oid bigint,
on string,
cut_id bigint,
cua_id bigint,
browser_version string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_currency_type (
id int,
currency_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_payment_type (
id int,
payment_type string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_week(
STD_WEEK_CODE STRING,
STD_WEEK_NAME STRING,
BEGIN_DATE STRING,
END_DATE STRING,
NOTES STRING,
IS_DISPLAY INT,
DISPLAY_ORDER INT,
IS_VALID INT,
UPDATE_DATE STRING,
LAST_STD_WEEK_CODE STRING
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_userinfo (
uid String,
uname string
)
row format delimited fields terminated by ‘\t’
;

load data local inpath ‘/root/dim/dim_province’ into table dim_province;
load data local inpath ‘/root/dim/dim_city’ into table dim_city;
load data local inpath ‘/root/dim/dim_province_city’ into table dim_province_city;
load data local inpath ‘/root/dim/dim_platform’ into table dim_platform;
load data local inpath ‘/root/dim/dim_kpi’ into table dim_kpi;
load data local inpath ‘/root/dim/dim_event_name’ into table dim_event_name;
load data local inpath ‘/root/dim/dim_browser_name’ into table dim_browser_name;
load data local inpath ‘/root/dim/dim_browser_version’ into table dim_browser_version;
load data local inpath ‘/root/dim/dim_userinfo’ into table dim_userinfo;


在ods层创建数据表:这里用到对元数据进行处理(UDF)
create table if not exists ods_font_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;

create table if not exists ods_end_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;

create table if not exists ods_logs(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\t’
stored as orc
;

insert into ods_logs
select
from ods_font_log
union all
select
from ods_end_log
;

udf:
ip
url解析
或者用mr清洗数据:

create table if not exists ods_logs(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;
注:
删除内部表会直接删除元数据(metadata)及存储数据;
删除外部表仅仅会删除元数据,HDFS上的文件并不会被删除;

create external table if not exists ods_logs_orc(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;

导入原始数据:
#load data inpath ‘/ods/11/09/’ into table ods_logs;
我因为没跑mr程序所以没有这个数据,让同学给了一份:/root/dim/part-m-00000
load data local inpath ‘/root/dim/part-m-00000’ into table ods_logs;

将数据导入到orc表中

from ods_logs
insert into ods_logs_orc(
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
on ,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
) partition (month=1,day=‘01’)
select
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
on ,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
;

上面的报错,可以直接简写为:
from ods_logs
insert into ods_logs_orc partition(month=1,day=1)
select *
;

创建dw层的数据
##为新增用户、总用户、活跃用户做计算:
create table if not exists dwd_user(
pl string,
pl_id string,
en string,
en_id string,
browser_name string,
browser_id string,
browser_version string,
browser_version_id string,
province_name string,
province_id string,
city_name string,
city_id string,
uid string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;

导入数据:
from (
select
ol.pl,
dp.id as pl_id,
ol.en,
de.id as en_id,
ol.browser_name,
db.id browser_id,
ol.browser_version ,
dv.id browser_version_id,
ol.province ,
pv.id province_id,
ol.city ,
dc.id city_id,
ol.u_ud
from ods.ods_logs_orc ol
left join dim.dim_platform dp on dp.platform_name = ol.pl
left join dim.dim_event_name de on de.name = ol.en
left join dim.dim_browser_name db on db.browser_name = ol.browser_name
left join dim.dim_browser_version dv on dv.browser_version = ol.browser_version
left join dim.dim_province pv on pv.province = ol.province
left join dim.dim_city dc on dc.city = ol.city
) tmp
insert into dwd_user partition (month=‘1’,day = ‘01’)
select *
;


另一种写法:
with tmp as (
select
ol.pl,
dp.id as pl_id,
ol.en,
de.id as en_id,
ol.browser_name,
db.id browser_id,
ol.browser_version ,
dv.id browser_version_id,
ol.province_name ,
pv.id province_id,
ol.city_name ,
dc.id city_id,
ol.uid
from ods_logs_orc ol
left join dim_platform dp on dp.platform_name = ol.pl
left join dim_event_name de on de.name = ol.en
left join dim_browser_name db on db.browser_name = ol.browser_Name
left join dim_browser_version dv on dv.browser_version = ol.browser_version
left join dim_province pv on pv.province = ol.province_name
left join dim_city dc on dc.city = ol.city_name
)
insert into dwd_user partition (month=‘1’,day = ‘01’)
select * from tmp
;

————————————————
版权声明:本文为CSDN博主「大岛君」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/myloveprogrmming/article/details/85813114

Hadoop Hive构建在 Hadoop 生态系统上的数据仓库基础设施,它提供了一种类似于 SQL 的查询语言,用于处理和分析存储在 Hadoop 分布式文件系统(HDFS)中的大规模数据。 Hive 的基本原理如下: 1. 数据存储:Hive 将数据以表的形式存储在 HDFS 上,数据可以是结构化、半结构化或非结构化的。Hive 表定义包含表的结构(例如列和数据类型)以及数据的存储位置。 2. 元数据管理:Hive 使用元数据来管理表和表之间的关系。元数据包括表的结构、分区信息、表的存储位置等。Hive 的元数据可以使用自带的 Derby 数据库或者外部数据库(如 MySQL)进行存储。 3. 查询优化与执行:Hive 提供了类似于 SQL 的查询语言 HiveQL,用户可以使用 HiveQL 编写查询语句。当用户提交查询时,Hive 会进行查询优化,生成一个逻辑执行计划,并将其转换为 MapReduce 作业或 Tez 任务来执行。 4. 数据转换与计算:Hive 支持用户定义的函数(UDF)和自定义聚合函数(UDAF),用户可以使用这些函数来进行数据转换和计算。Hive 还提供了一些内置函数,用于常见的数据操作和处理。 5. 数据分区与分桶:Hive 支持数据的分区和分桶,可以根据某个列的值将数据划分为多个分区,或者根据某个列的哈希值将数据划分为多个桶。这样可以提高查询性能和数据处理效率。 通过上述原理,Hive 提供了一种方便的方式来处理和分析大规模数据,并且能够与其他 Hadoop 生态系统工具(如Hadoop MapReduce、HBase等)进行无缝集成。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值