离线Hive数据仓库学习
hbase(main):005:0> count 'events_db:train'
flume -> kafka -> hbase -> hive
[root@gree139 hive110]# nohup ./bin/hive --service hiveserver2 &
[root@gree139 hive110]# nohup ./bin/hive --service metastore &
-- Disable automatic conversion of common joins into map-side joins.
-- NOTE: the original comment said "avoid aggregation in the map phase",
-- but that is controlled by hive.map.aggr, not by this property.
set hive.auto.convert.join=false;
确认业务 --> 声明粒度 --> 确认维度 --> 确认事实
ODS 数据准备区 贴源层 //原始数据
DWD 细节数据层 // 脱敏/清洗
DWS 数据服务层 //轻度汇总 聚合
DM //大宽表,基于某个业务,把不同维度的信息通过粒度聚合起来
DWT 主题层 大宽表
ADS //数据应用层,精确数据,为不同用户所使用
-- Create the DWD (detail/cleansed) layer database and switch to it.
-- ('//' is not a valid HiveQL comment; use '--'.)
create database if not exists dwd_events;
use dwd_events;
-- users: expose HBase table events_db:users in Hive through the HBase
-- storage handler, then snapshot it into an ORC-backed managed table.
create external table if not exists hb_users(
userid string,
birthyear int,
gender string,
locale string,
joinedAt string,
location string,
timezone string
)stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' with serdeproperties
('hbase.columns.mapping'=':key,profile:birthyear,profile:gender,region:locale,registration:joinedAt,region:location,region:timezone')
tblproperties ('hbase.table.name'='events_db:users');
select * from hb_users;
-- drop first so the CTAS below can be re-run (matches the train section's style)
drop table if exists users;
create table users stored as orc as
select * from hb_users;
select * from users;
-- the external mapping table is only a staging view over HBase; remove it
drop table if exists hb_users;
-- events: expose HBase table events_db:events in Hive through the HBase
-- storage handler, then snapshot it into an ORC-backed managed table.
create external table if not exists hb_events(
eventid string,
user_id string,
city string,
state string,
zip string,
country string,
lat float,
lng float,
common_words string,
start_time string
) stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,creator:user_id,location:city,location:state,location:zip,location:country,location:lat,location:lng,remark:common_words,schedule:start_time')
tblproperties ('hbase.table.name'='events_db:events');
select * from hb_events;
-- drop first so the CTAS below can be re-run
drop table if exists events;
create table events stored as orc as
select * from hb_events;
select * from events;
-- the external mapping table is only a staging view over HBase; remove it
drop table if exists hb_events;
show tables;
-- event_attendee: expose HBase table events_db:event_attendee in Hive,
-- then snapshot it into an ORC-backed managed table.
create external table if not exists hb_event_attendee(
row_key string,
eventid string,
friendid string,
state string
) stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
-- NOTE: hbase.columns.mapping must not contain whitespace; the original
-- ':key, euat:eventid' (space after the comma) makes the storage handler
-- reject or mis-parse the column-family/qualifier spec
with serdeproperties ('hbase.columns.mapping'=':key,euat:eventid,euat:friendid,euat:state')
tblproperties ('hbase.table.name'='events_db:event_attendee');
-- drop first so the CTAS below can be re-run
drop table if exists event_attendee;
create table event_attendee stored as orc as
select * from hb_event_attendee;
select * from event_attendee;
drop table if exists hb_event_attendee;
-- user_friend: expose HBase table events_db:user_friend in Hive,
-- then snapshot it into an ORC-backed managed table.
create external table if not exists hb_user_friend(
row_key string,
userid string,
friendid string
) stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
-- NOTE: hbase.columns.mapping must not contain whitespace; the original
-- ':key, uf:userid' (space after the comma) breaks the column spec
with serdeproperties ('hbase.columns.mapping'=':key,uf:userid,uf:friendid')
tblproperties ('hbase.table.name'='events_db:user_friend');
drop table if exists user_friend;
create table user_friend stored as orc as
select * from hb_user_friend;
drop table if exists hb_user_friend;
-- train: expose HBase table events_db:train in Hive and keep only real data
-- rows (the source header row was loaded with userid = 'user').
create external table if not exists hb_train(
rowkey string,
userid string,
eventid string,
invited string,
-- `timestamp` is a Hive type keyword; backtick-quote it so the DDL parses
-- when reserved-keyword enforcement is enabled
`timestamp` string,
interested string
) stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
-- NOTE: hbase.columns.mapping must not contain whitespace (original had
-- a space after ':key,')
with serdeproperties ('hbase.columns.mapping'=':key,eu:user,eu:event,eu:invited,eu:timestamp,eu:interested')
tblproperties ('hbase.table.name'='events_db:train');
select * from hb_train where userid!='user';
use dwd_events;
drop table if exists train;
create table train stored as orc as
select * from hb_train where userid!='user';
select count(*) from train;
-- sanity check: the header row must not survive the filter
select * from train where userid='user';
drop table if exists hb_train;
/**
[root@gree139 kb15tmp]# hdfs dfs -mkdir -p /events/ods/locale
22/01/11 15:28:31 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[root@gree139 kb15tmp]# hdfs dfs -mkdir -p /events/ods/timezone
22/01/11 15:28:38 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[root@gree139 kb15tmp]# hdfs dfs -put ./locale.txt /events/ods/locale
22/01/11 15:28:54 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[root@gree139 kb15tmp]# hdfs dfs -put ./time_zone.csv /events/ods/timezone
22/01/11 15:29:10 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
*/
-- locale dimension: read the tab-delimited locale.txt staged under
-- /events/ods/locale, then persist it as an ORC-backed managed table.
create external table if not exists ods_local(
local_id int,
locale string
) row format delimited fields terminated by '\t'
location '/events/ods/locale';
select * from ods_local;
-- drop first so the CTAS below can be re-run
drop table if exists local;
create table local stored as orc as
select * from ods_local;
select * from local;
-- the external staging table is no longer needed
drop table if exists ods_local;
-- time_zone dimension: read the comma-delimited time_zone.csv staged under
-- /events/ods/timezone, then persist it as an ORC-backed managed table.
create external table if not exists ods_time_zone(
time_zone_id int,
time_zone string
)row format delimited fields terminated by ','
location '/events/ods/timezone';
select * from ods_time_zone;
-- drop first so the CTAS below can be re-run
drop table if exists time_zone;
create table time_zone stored as orc as
select * from ods_time_zone;
-- the external staging table is no longer needed
drop table if exists ods_time_zone;
select * from time_zone;
use dws_events;
-- user_attend_event_count: per-user totals derived from the
-- per-(user, event) state table user_event_state — how many event
-- invitations each user received, and how many events they marked
-- maybe / no / yes.
-- Example shape:
--   userid  invited_count  maybe_attend_count  no_attend_count  yes_attend_count
--   111     12             0                   0                12
--   222     10             0                   10               0
create table user_attend_event_count stored as orc as
select
    attend_user_id            as userid,
    sum(invited)              as invited_count,
    sum(maybe_attend)         as maybe_attend_count,
    sum(no_attend)            as no_attend_count,
    sum(yes_attend)           as yes_attend_count
from user_event_state
group by attend_user_id;