hive数仓构建

最新推荐文章于 2024-08-27 23:41:00 发布

阿鸿的啊

最新推荐文章于 2024-08-27 23:41:00 发布

阅读量1.1k

点赞数 1

文章标签： hive数仓搭建

原文链接：https://blog.csdn.net/myloveprogrmming/article/details/85813114

版权

转载自：https://blog.csdn.net/myloveprogrmming/article/details/85813114
1、获取数据源
后台的服务数据
前台的点击流日志数据
业务数据

2、确定主题
用户主题：用户、会员相关的信息
订单主题：订单相关的信息
浏览器主题：跟浏览器相关的信息
事件主题：跟事件相关

3、创建模型（创建表）
100张表以内不分层，以外将分层。
如果维度表较多则将维度表单独分层

1、创建维度层：
dim_维度表
地域维度：拆分成省市
浏览器维度：浏览器名称、版本
时间维度：周、季度
事件维度：category(种类)、action
平台维度：
kpi：（指标）

创建ods库：注：ods=operate database store
创建dw库：dw=data warehouse 数据仓库
创建dm库： dm=date manipulation数据操作

命名规则：
库名_主题_表名
ods_user_addr_
ODS_USER_ADDR_(大小写均可，但最好小写)
ods.user.addr不行
事实表
create database if not exists dim;//纬度
create database if not exists ods;//操作数据库存储
create database if not exists dw;//数据仓库
create database if not exists dm;//数据操作

在维度库下面创建维度表：注：字段带有特殊符号用这个解决,这里desc是关键字，所以用
CREATE TABLE IF NOT EXISTS dim_province (
id int,
province string,
country_id int,
desc string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_city (
id int,
city string,
desc string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_province_city (
dim_region_id bigint,
dim_region_city_name string,
dim_region_province_name string,
dim_region_country_name string,
dim_region_city_id string,
dim_region_province_id string,
dim_region_country_id string,
dim_region_date string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_platform (
id int,
platform_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_kpi (
id int,
kpi_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_name (
id int,
name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_category (
id int,
category string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_action (
id int,
action string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_browser_name (
id int,
browser_name string,
browser_version_id int
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_browser_version (
id int,
browser_version string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_order (
oid bigint,
on string,
cut_id bigint,
cua_id bigint,
browser_version string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_currency_type (
id int,
currency_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_payment_type (
id int,
payment_type string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_week(
STD_WEEK_CODE STRING,
STD_WEEK_NAME STRING,
BEGIN_DATE STRING,
END_DATE STRING,
NOTES STRING,
IS_DISPLAY INT,
DISPLAY_ORDER INT,
IS_VALID INT,
UPDATE_DATE STRING,
LAST_STD_WEEK_CODE STRING
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_userinfo (
uid String,
uname string
)
row format delimited fields terminated by ‘\t’
;

load data local inpath ‘/root/dim/dim_province’ into table dim_province;
load data local inpath ‘/root/dim/dim_city’ into table dim_city;
load data local inpath ‘/root/dim/dim_province_city’ into table dim_province_city;
load data local inpath ‘/root/dim/dim_platform’ into table dim_platform;
load data local inpath ‘/root/dim/dim_kpi’ into table dim_kpi;
load data local inpath ‘/root/dim/dim_event_name’ into table dim_event_name;
load data local inpath ‘/root/dim/dim_browser_name’ into table dim_browser_name;
load data local inpath ‘/root/dim/dim_browser_version’ into table dim_browser_version;
load data local inpath ‘/root/dim/dim_userinfo’ into table dim_userinfo;

在ods层创建数据表：这里用到对元数据进行处理(UDF)
create table if not exists ods_font_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;

create table if not exists ods_end_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;

create table if not exists ods_logs(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\t’
stored as orc
;

insert into ods_logs
select
from ods_font_log
union all
select
from ods_end_log
;

udf:
ip
url解析
或者用mr清洗数据：

create table if not exists ods_logs(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;
注：
删除内部表会直接删除元数据（metadata）及存储数据；
删除外部表仅仅会删除元数据，HDFS上的文件并不会被删除；

create external table if not exists ods_logs_orc(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;

导入原始数据：
#load data inpath ‘/ods/11/09/’ into table ods_logs;
我因为没跑mr程序所以没有这个数据，让同学给了一份：/root/dim/part-m-00000
load data local inpath ‘/root/dim/part-m-00000’ into table ods_logs;

将数据导入到orc表中

from ods_logs
insert into ods_logs_orc(
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
`on` ,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
) partition (month=1,day=‘01’)
select
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
`on` ,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
;

上面的报错，可以直接简写为：
from ods_logs
insert into ods_logs_orc partition(month=1,day=1)
select *
;

创建dw层的数据
##为新增用户、总用户、活跃用户做计算：
create table if not exists dwd_user(
pl string,
pl_id string,
en string,
en_id string,
browser_name string,
browser_id string,
browser_version string,
browser_version_id string,
province_name string,
province_id string,
city_name string,
city_id string,
uid string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;

导入数据：
from (
select
ol.pl,
dp.id as pl_id,
ol.en,
de.id as en_id,
ol.browser_name,
db.id browser_id,
ol.browser_version ,
dv.id browser_version_id,
ol.province ,
pv.id province_id,
ol.city ,
dc.id city_id,
ol.u_ud
from ods.ods_logs_orc ol
left join dim.dim_platform dp on dp.platform_name = ol.pl
left join dim.dim_event_name de on de.name = ol.en
left join dim.dim_browser_name db on db.browser_name = ol.browser_name
left join dim.dim_browser_version dv on dv.browser_version = ol.browser_version
left join dim.dim_province pv on pv.province = ol.province
left join dim.dim_city dc on dc.city = ol.city
) tmp
insert into dwd_user partition (month=‘1’,day = ‘01’)
select *
;