转载自:https://blog.csdn.net/myloveprogrmming/article/details/85813114
1、获取数据源
后台的服务数据
前台的点击流日志数据
业务数据
2、确定主题
用户主题:用户、会员相关的信息
订单主题:订单相关的信息
浏览器主题:跟浏览器相关的信息
事件主题:跟事件相关
3、创建模型(创建表)
100张表以内不分层,以外将分层。
如果维度表较多则将维度表单独分层
1、创建维度层:
dim_维度表
地域维度:拆分成 省 市
浏览器维度:浏览器名称、版本
时间维度:周、季度
事件维度:category(种类)、action
平台维度:
kpi:(指标)
创建ods库:注:ods=operate database store
创建dw库:dw=data warehouse 数据仓库
创建dm库: dm=date manipulation数据操作
命名规则:
库名_主题_表名
ods_user_addr_
ODS_USER_ADDR_(大小写均可,但最好小写)
ods.user.addr不行
事实表
create database if not exists dim;//纬度
create database if not exists ods;//操作数据库存储
create database if not exists dw;//数据仓库
create database if not exists dm;//数据操作
在维度库下面创建维度表:注:字段带有特殊符号用这个解决,这里desc是关键字,所以用
CREATE TABLE IF NOT EXISTS dim_province
(
id
int,
province
string,
country_id
int,
desc
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_city
(
id
int,
city
string,
desc
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_province_city
(
dim_region_id
bigint,
dim_region_city_name
string,
dim_region_province_name
string,
dim_region_country_name
string,
dim_region_city_id
string,
dim_region_province_id
string,
dim_region_country_id
string,
dim_region_date
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_platform
(
id
int,
platform_name
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_kpi
(
id
int,
kpi_name
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_event_name
(
id
int,
name
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_event_category
(
id
int,
category
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_event_action
(
id
int,
action
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_browser_name
(
id
int,
browser_name
string,
browser_version_id
int
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_browser_version
(
id
int,
browser_version
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_order
(
oid
bigint,
on
string,
cut_id
bigint,
cua_id
bigint,
browser_version
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_currency_type
(
id
int,
currency_name
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_payment_type
(
id
int,
payment_type
string
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_week
(
STD_WEEK_CODE STRING,
STD_WEEK_NAME STRING,
BEGIN_DATE STRING,
END_DATE STRING,
NOTES STRING,
IS_DISPLAY INT,
DISPLAY_ORDER INT,
IS_VALID INT,
UPDATE_DATE STRING,
LAST_STD_WEEK_CODE STRING
)
row format delimited fields terminated by ‘\t’
;
CREATE TABLE IF NOT EXISTS dim_userinfo
(
uid
String,
uname
string
)
row format delimited fields terminated by ‘\t’
;
load data local inpath ‘/root/dim/dim_province’ into table dim_province;
load data local inpath ‘/root/dim/dim_city’ into table dim_city;
load data local inpath ‘/root/dim/dim_province_city’ into table dim_province_city;
load data local inpath ‘/root/dim/dim_platform’ into table dim_platform;
load data local inpath ‘/root/dim/dim_kpi’ into table dim_kpi;
load data local inpath ‘/root/dim/dim_event_name’ into table dim_event_name;
load data local inpath ‘/root/dim/dim_browser_name’ into table dim_browser_name;
load data local inpath ‘/root/dim/dim_browser_version’ into table dim_browser_version;
load data local inpath ‘/root/dim/dim_userinfo’ into table dim_userinfo;
在ods层创建数据表:这里用到对元数据进行处理(UDF)
create table if not exists ods_font_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;
create table if not exists ods_end_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;
create table if not exists ods_logs(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\t’
stored as orc
;
insert into ods_logs
select
from ods_font_log
union all
select
from ods_end_log
;
udf:
ip
url解析
或者用mr清洗数据:
create table if not exists ods_logs(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on
string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;
注:
删除内部表会直接删除元数据(metadata)及存储数据;
删除外部表仅仅会删除元数据,HDFS上的文件并不会被删除;
create external table if not exists ods_logs_orc(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on
string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;
导入原始数据:
#load data inpath ‘/ods/11/09/’ into table ods_logs;
我因为没跑mr程序所以没有这个数据,让同学给了一份:/root/dim/part-m-00000
load data local inpath ‘/root/dim/part-m-00000’ into table ods_logs;
将数据导入到orc表中
from ods_logs
insert into ods_logs_orc(
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
on
,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
) partition (month=1,day=‘01’)
select
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
on
,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
;
上面的报错,可以直接简写为:
from ods_logs
insert into ods_logs_orc partition(month=1,day=1)
select *
;
创建dw层的数据
##为新增用户、总用户、活跃用户做计算:
create table if not exists dwd_user(
pl string,
pl_id string,
en string,
en_id string,
browser_name string,
browser_id string,
browser_version string,
browser_version_id string,
province_name string,
province_id string,
city_name string,
city_id string,
uid string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;
导入数据:
from (
select
ol.pl,
dp.id as pl_id,
ol.en,
de.id as en_id,
ol.browser_name,
db.id browser_id,
ol.browser_version ,
dv.id browser_version_id,
ol.province ,
pv.id province_id,
ol.city ,
dc.id city_id,
ol.u_ud
from ods.ods_logs_orc ol
left join dim.dim_platform dp on dp.platform_name = ol.pl
left join dim.dim_event_name de on de.name = ol.en
left join dim.dim_browser_name db on db.browser_name = ol.browser_name
left join dim.dim_browser_version dv on dv.browser_version = ol.browser_version
left join dim.dim_province pv on pv.province = ol.province
left join dim.dim_city dc on dc.city = ol.city
) tmp
insert into dwd_user partition (month=‘1’,day = ‘01’)
select *
;
另一种写法:
with tmp as (
select
ol.pl,
dp.id as pl_id,
ol.en,
de.id as en_id,
ol.browser_name,
db.id browser_id,
ol.browser_version ,
dv.id browser_version_id,
ol.province_name ,
pv.id province_id,
ol.city_name ,
dc.id city_id,
ol.uid
from ods_logs_orc ol
left join dim_platform dp on dp.platform_name = ol.pl
left join dim_event_name de on de.name = ol.en
left join dim_browser_name db on db.browser_name = ol.browser_Name
left join dim_browser_version dv on dv.browser_version = ol.browser_version
left join dim_province pv on pv.province = ol.province_name
left join dim_city dc on dc.city = ol.city_name
)
insert into dwd_user partition (month=‘1’,day = ‘01’)
select * from tmp
;
————————————————
版权声明:本文为CSDN博主「大岛君」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/myloveprogrmming/article/details/85813114