-- Mapping HBase table data into Hive (HBase 数据映射到 Hive)
-- Create the database (idempotent) and switch to it.
create database if not exists events;
use events;
-- Bind the hivevar "db" to the database name so later statements can use ${db}.
set hivevar:db=events;
-- nonstrict mode: every partition column may be a dynamic partition column
-- (no static partition value required).
set hive.exec.dynamic.partition.mode=nonstrict;
-- Sort rows by the dynamic-partition key before writing, so each writer keeps
-- only one partition file open at a time (avoids OOM with many partitions).
set hive.optimize.sort.dynamic.partition=true;
-- Disable automatic conversion of common joins into map-side joins.
set hive.auto.convert.join=false;
-- Check per-directory disk usage under the hive install dir (or simply "df -h")
-- to make sure the disk is not full before creating tables:
[root@hadoop222 hive]# du -h -x --max-depth=1
1.6M ./examples
104K ./docs
24K ./conf
2.0M ./scripts
20M ./hcatalog
7.8M ./auxlib
113M ./lib
160K ./bin
32K ./data
0 ./warehouse
145M .
-- Check the working state of the YARN nodes:
[root@hadoop222 hive]# yarn node -list all
21/06/07 12:03:17 INFO client.RMProxy: Connecting to ResourceManager at hadoop222/192.168.146.222:8032
21/06/07 12:03:17 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Total Nodes:1
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
hadoop222:36360 RUNNING hadoop222:8042 0
-- External table over the HBase table events_db:users (no data copied;
-- reads go straight to HBase via the storage handler).
create external table if not exists ${db}.hb_users(
    user_id String,
    birth_year INT,
    gender String,
    locale String,
    location String,
    time_zone String,
    joined_at String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,profile:birthyear,profile:gender,region:locale,region:location,region:timezone,registration:joinedAt')
tblproperties ('hbase.table.name'='events_db:users');
-- Materialize the data into a Hive-managed ORC table for faster reads.
-- Explicit column list instead of SELECT * so the copy is robust to
-- schema drift in the external mapping.
create table ${db}.users stored as orc as
select user_id, birth_year, gender, locale, location, time_zone, joined_at
from ${db}.hb_users;
-- The external mapping is no longer needed once the ORC copy exists.
drop table if exists ${db}.hb_users;
-- The remaining tables repeat the same pattern as above (其他的表都是一样的重复操作).
-- External table over HBase events_db:events.
-- NOTE: hbase.columns.mapping must be a single comma-separated string with no
-- embedded whitespace/newlines; HBaseSerDe does not trim the entries, so the
-- original multi-line value would fail to parse.
create external table if not exists ${db}.hb_events(
    event_id String,
    user_id String,
    event_time String,
    city String,
    state String,
    zip String,
    country String,
    lat Float,
    lng Float,
    common_words String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,creator:user_id,schedule:start_time,location:city,location:state,location:zip,location:country,location:lat,location:lng,remark:common_words')
tblproperties ('hbase.table.name'='events_db:events');
-- Materialize into a Hive-managed ORC table for faster reads.
create table ${db}.events stored as orc as
select event_id, user_id, event_time, city, state, zip, country, lat, lng, common_words
from ${db}.hb_events;
-- The external mapping is no longer needed once the ORC copy exists.
drop table if exists ${db}.hb_events;
-- External table over HBase events_db:user_friend.
-- The column mapping is kept on one line: HBaseSerDe does not trim whitespace,
-- so embedded newlines in the mapping value break parsing.
create external table if not exists ${db}.hb_user_friend(
    row_key String,
    user_id String,
    friend_id String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,uf:userid,uf:friendid')
tblproperties ('hbase.table.name'='events_db:user_friend');
-- Materialize into a Hive-managed ORC table for faster reads.
create table ${db}.user_friend stored as orc as
select row_key, user_id, friend_id
from ${db}.hb_user_friend;
-- The external mapping is no longer needed once the ORC copy exists.
drop table if exists ${db}.hb_user_friend;
-- External table over HBase events_db:event_attendee.
-- Mapping kept on one line (HBaseSerDe does not tolerate embedded whitespace).
-- NOTE(review): Hive column user_id is mapped to qualifier euat:friendid —
-- looks like a naming mismatch; confirm the actual qualifier in HBase.
create external table if not exists ${db}.hb_event_attendee(
    row_key String,
    event_id String,
    user_id String,
    attend_type String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,euat:eventid,euat:friendid,euat:state')
tblproperties ('hbase.table.name'='events_db:event_attendee');
-- Materialize into a Hive-managed ORC table for faster reads.
create table ${db}.event_attendee stored as orc as
select row_key, event_id, user_id, attend_type
from ${db}.hb_event_attendee;
-- The external mapping is no longer needed once the ORC copy exists.
drop table if exists ${db}.hb_event_attendee;
-- External table over HBase events_db:train.
-- "timestamp" is a Hive reserved keyword / type name, so it must be backquoted
-- when used as a column identifier.
-- Mapping kept on one line (HBaseSerDe does not tolerate embedded whitespace).
create external table if not exists ${db}.hb_train(
    row_key String,
    user_id String,
    event_id String,
    invited String,
    `timestamp` String,
    interested String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,eu:user,eu:event,eu:invited,eu:timestamp,eu:interested')
tblproperties ('hbase.table.name'='events_db:train');
-- Materialize into a Hive-managed ORC table for faster reads.
create table ${db}.train stored as orc as
select row_key, user_id, event_id, invited, `timestamp`, interested
from ${db}.hb_train;
-- The external mapping is no longer needed once the ORC copy exists.
drop table if exists ${db}.hb_train;