# Primary key
rowkey design
sqoop ->hbase
hive ->hbase
java api ->hbase
console ->hbase
# Accessing HBase from the console
hbase shell
# Storage structure
Database: namespace
Table: table
Column: column family
The relationship between an HTable and its Regions is similar to that between an HDFS file and its blocks
HMaster responsibilities
1. Manages user create/delete/alter/query operations on tables
2. Manages HRegionServer load balancing and adjusts Region distribution
3. Assigns the new Regions after a Region split
4. Migrates the Regions of a failed HRegionServer after it goes down (fault tolerance)
HRegionServer: write path
  HRegion * N
    HRegion1
      MemStore => write cache; =flush threshold=> StoreFile
      StoreFiles =compaction threshold=> merged StoreFile =size threshold=> region split
    HRegion2
    ...
  HLog (WAL => Write Ahead Log)
client -write data-> HRegionServer-x -> HRegion -> MemStore -> ... (flushed later)
                              |-in-> HLog (backup)
                                        |-out-> HMaster -migrate-> HRegionServer2 -> ... (HLog replayed to recover the Regions)
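# The client-side knob for the HLog step above is Put durability; a minimal hedged sketch (org.apache.hadoop.hbase.client.Durability; the Table handle and sample row are the ones used in the Java API section further down):
Put p = new Put(Bytes.toBytes("6"));
p.addColumn(Bytes.toBytes("base"), Bytes.toBytes("name"), Bytes.toBytes("tom"));
p.setDurability(Durability.SYNC_WAL);   // sync the edit to the HLog (WAL) before acking the client
// p.setDurability(Durability.SKIP_WAL); // faster, but MemStore-only data is lost if the server crashes
table.put(p);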
# Operating HBase
# There are only insert (append) operations; versioning is built from the operation type and timestamp, ordered in time as 0, 1, 2, ...
# Historical data can be traced by switching between versions
# The maximum number of retained versions per cell can be configured
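# A hedged Java sketch of reading back several versions of one cell (it uses the Table object created in the Java API section further down, assumes the 'base' family keeps more than one version, and the row/column names are just the sample data used later):
Get get = new Get(Bytes.toBytes("1"));
get.addColumn(Bytes.toBytes("base"), Bytes.toBytes("name"));
get.setMaxVersions(3); // ask for up to 3 versions of the cell
Result result = table.get(get);
for (Cell cell : result.rawCells()) { // one Cell per returned version
    System.out.println(cell.getTimestamp() + " -> " + Bytes.toString(CellUtil.cloneValue(cell)));
}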
# Fault tolerance
# Why HBase writes are fast: a write returns to the client as soon as the data is in the MemStore (memory); the later flush, compaction and split are all deferred.
# Why HBase reads are fast: frequently read blocks are cached in the BlockCache, and the Bloom filter quickly rules out StoreFiles/blocks that cannot contain the key; data already in the BlockCache is returned straight from memory.
# HBase client
#DDL
# List all tables
list
# List namespaces
list_namespace
# Create a namespace
create_namespace 'kb16nb'
# List the tables under a given namespace
list_namespace_tables 'kb16nb'
# Create a table
create 'kb16nb:student','base','bigdata','cloud'
# Describe the table definition
describe 'kb16nb:student'
# Full create syntax
create 'kb16nb:student',
{NAME => 'base', BLOOMFILTER => 'ROW', IN_MEMORY => 'true', VERSIONS => '1', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', COMPRESSION => 'NONE', TTL => 'FOREVER', MIN_VERSIONS => '0', BLOCKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'},
{NAME => 'bigdata', BLOOMFILTER => 'ROW', IN_MEMORY => 'false', VERSIONS => '1', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', COMPRESSION => 'NONE', TTL => 'FOREVER', MIN_VERSIONS => '0', BLOCKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'},
{NAME => 'cloud', BLOOMFILTER => 'ROW', IN_MEMORY => 'false', VERSIONS => '1', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', COMPRESSION => 'NONE', TTL => 'FOREVER', MIN_VERSIONS => '0', BLOCKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'}
# BLOOMFILTER: NONE, ROW (default), or ROWCOL
# The Bloom filter quickly tells whether a key can possibly be in an HFile (StoreFile), so blocks that cannot match are skipped
# If you usually read whole rows, ROWCOL is not recommended: it needs more memory, and ROW covers the ROWCOL use case but not the other way round
# Not suitable for tables with frequent deletes; set it to NONE there, because the Bloom filter would have to be rebuilt
# IN_MEMORY
# The BlockCache (LruBlockCache, enabled by default) has three priorities:
# Single Access: a block read from HDFS for the first time; evicted first when space is needed
# Multi Access: a Single block promoted when it is accessed again; evicted second
# In-memory Access: blocks of column families marked IN_MEMORY, regardless of access count; evicted last
# Java API: HColumnDescriptor.setInMemory(true);
# At table creation, e.g.: create 't', {NAME=>'f',IN_MEMORY=>'true'}
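# A hedged Java sketch of the same column-family settings through the HColumnDescriptor API mentioned above (BloomType is org.apache.hadoop.hbase.regionserver.BloomType; the table name is hypothetical; admin is the Admin object from the Java API section below):
HColumnDescriptor base = new HColumnDescriptor("base");
base.setBloomFilterType(BloomType.ROW); // BLOOMFILTER => 'ROW'
base.setInMemory(true);                 // IN_MEMORY => 'true'
base.setMaxVersions(1);                 // VERSIONS => '1'
base.setBlockCacheEnabled(true);        // BLOCKCACHE => 'true'
HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("kb16nb:student_api"));
desc.addFamily(base);
admin.createTable(desc);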
is_enabled 'kb16nb:student' # is the table enabled?
is_disabled 'kb16nb:student' # is the table disabled?
enable 'kb16nb:student' # enable the table
disable 'kb16nb:student' # disable the table
drop 'kb16nb:student' # drop the table (it must be disabled first)
#DML
truncate 'kb16nb:student' # remove all data from the table
delete 'kb16nb:student','rowkey','columnfamily:colname' # delete the latest version of a cell (a column must be given)
deleteall 'kb16nb:student','rowkey' # delete an entire row (all columns, all versions)
deleteall 'kb16nb:student','rowkey','columnfamily:colname' # delete all versions of a cell
put 'kb16nb:student','1','base:name','zhangsan'
scan 'kb16nb:student' # scan the whole table
count 'kb16nb:student' # count the rows
The most direct way to get a row count is to run the count command in the hbase shell.
hbase> count 't1'
hbase> count 't1', INTERVAL => 100000
hbase> count 't1', CACHE => 1000
hbase> count 't1', INTERVAL => 10, CACHE => 1000
# INTERVAL is how often progress is reported (every N rows, default 1000) and CACHE is the scanner cache size. This is a full client-side scan, so it is slow and not recommended for large tables
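# A hedged Java sketch of what count effectively does: a full scan counted on the client, which is why it is slow; CACHE corresponds to scanner caching here (Table handle as in the Java API section below):
Scan countScan = new Scan();
countScan.setCaching(1000); // fetch 1000 rows per RPC, like CACHE => 1000
long rowCount = 0;
try (ResultScanner scanner = table.getScanner(countScan)) {
    for (Result r : scanner) {
        rowCount++;
    }
}
System.out.println("row count = " + rowCount);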
get 'kb16nb:student','2','base:name'
scan 'kb16nb:student',{COLUMN=>'base'}
scan 'kb16nb:student',{COLUMN=>'base:name'}
scan 'kb16nb:student',{COLUMN=>'base:name',LIMIT=>2,STARTROW=>'2'}
scan 'kb16nb:student',{
COLUMN=>'base:name',
LIMIT=>3,
STARTROW=>'2', # inclusive
STOPROW=>'4'} # exclusive
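# A hedged Java counterpart of the range scan above, assuming the HBase 2.x client API (Table handle as in the Java API section below); the start row is inclusive and the stop row exclusive, same as STARTROW/STOPROW:
Scan rangeScan = new Scan();
rangeScan.addColumn(Bytes.toBytes("base"), Bytes.toBytes("name"));
rangeScan.withStartRow(Bytes.toBytes("2")); // inclusive
rangeScan.withStopRow(Bytes.toBytes("4"));  // exclusive
rangeScan.setLimit(3);                      // LIMIT => 3
try (ResultScanner scanner = table.getScanner(rangeScan)) {
    for (Result r : scanner) {
        System.out.println(Bytes.toString(r.getRow()) + " -> "
                + Bytes.toString(r.getValue(Bytes.toBytes("base"), Bytes.toBytes("name"))));
    }
}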
put 'kb16nb:student','1','base:name','zhangsan'
put 'kb16nb:student','2','base:name','lisa'
put 'kb16nb:student','3','base:name','jack'
put 'kb16nb:student','4','base:name','rose'
put 'kb16nb:student','5','base:name','jane'
put 'kb16nb:student','1','base:age',18
put 'kb16nb:student','2','base:age',19
put 'kb16nb:student','3','base:age',20
put 'kb16nb:student','4','base:age',21
put 'kb16nb:student','5','base:age',22
put 'kb16nb:student','1','base:product','bigdata'
put 'kb16nb:student','2','base:product','cloud'
put 'kb16nb:student','3','base:product','bigdata'
put 'kb16nb:student','4','base:product','cloud'
put 'kb16nb:student','5','base:product','bigdata'
put 'kb16nb:student','1','bigdata:hive',88
put 'kb16nb:student','3','bigdata:hive',76
put 'kb16nb:student','5','bigdata:hive',65
put 'kb16nb:student','1','bigdata:hbase',73
put 'kb16nb:student','3','bigdata:hbase',71
put 'kb16nb:student','5','bigdata:hbase',78
put 'kb16nb:student','2','cloud:net',77
put 'kb16nb:student','4','cloud:net',66
put 'kb16nb:student','2','cloud:shell',74
put 'kb16nb:student','4','cloud:shell',68
# Run from a shell script
echo "list_namespace_tables 'kb16nb'"|hbase shell -n
echo "list" |hbase shell -n
echo "describe 'kb16nb:student'"|hbase shell -n
# Create a table in Hive mapped to HBase
create external table kb16.hive_map_hbase_01(
stuid int,
stuname string,
stuage int,
product string
)stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ("hbase.columns.mapping"=":key,base:name,base:age,base:product")
tblproperties("hbase.table.name"="kb16nb:student");
# Internal handling of the Bloom filter false-positive rate
# Java API access to HBase (mainly for writing data in bulk)
# Create the configuration object and the connection object
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", "single01:2181");
Connection hbaseCon = ConnectionFactory.createConnection(config);
// Admin object for DDL operations
Admin admin = hbaseCon.getAdmin();
admin.xxx(tableName); // e.g. tableExists / disableTable / deleteTable, see the sketch after the TableName below
// Create the table name object
final String HTable="kb16nb:student";
TableName tableName = TableName.valueOf(HTable);
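// A hedged sketch of typical Admin calls with the tableName above (destructive calls commented out):
boolean exists = admin.tableExists(tableName);
System.out.println("table exists: " + exists);
// if (admin.isTableEnabled(tableName)) admin.disableTable(tableName);
// admin.deleteTable(tableName); // a table must be disabled before it can be dropped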
// Work with table data
Table table=hbaseCon.getTable(tableName);
// Put a single row
Put row = new Put(rowkey); // rowkey: byte[]
row.addColumn(columnfamily, column, value); // all arguments are byte[]
...
table.put(row);
// Multiple rows (small batches)
List<Put> rows = new ArrayList<>();
rows.add(row);
...
table.put(rows);
// Batch processing
// Create the exception listener for HBase bulk inserts with a lambda
// (it is a functional interface in Java, so a lambda expression works)
BufferedMutator.ExceptionListener listener = (e, mutator) -> {
    // exception message (cause)
    String msg = e.getMessage();
    // number of failed mutations
    int numExceptions = e.getNumExceptions();
    // record the row keys of the failed mutations so they can be inspected and retried later
    // logged with Log4j
    logger.error("HBASE MUTATE EXCEPTION : " + msg + "," + numExceptions);
    if (numExceptions > 0) {
        StringBuilder builder = new StringBuilder();
        builder.append(Bytes.toString(e.getRow(0).getRow()));
        final String SEP = ",";
        for (int i = 1; i < numExceptions; i++) { // start at 1: row 0 is already appended
            builder.append(SEP);
            builder.append(Bytes.toString(e.getRow(i).getRow()));
        }
        logger.error(builder.toString());
    }
};
final int BUFFER_SIZE=8*1024;
BufferedMutatorParams bmp=new BufferedMutatorParams(tableName)
.listener(listener).writeBufferSize(BUFFER_SIZE);
BufferedMutator mutator=hbaseCon.getBufferedMutator(bmp);
// Create the list of Puts
List<Put> list=new ArrayList<>(BUFFER_SIZE);
...
// Hand the data to the mutator
mutator.mutate(list);
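// The BufferedMutator only buffers the Puts client-side; nothing is guaranteed to reach the RegionServer until the buffer fills, flush() is called, or the mutator is closed. A hedged sketch of the cleanup these notes stop short of:
mutator.flush();  // force-send whatever is still buffered
mutator.close();  // flushes again and releases the write buffer
table.close();
hbaseCon.close();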
# Log4j logging
# 1. Add the Log4j jar dependency
# 2. Configuration: resources/log4j.properties
log4j.rootLogger=INFO, stdout, logfile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=log/hd.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
# 3. Create the logger object
private static final Logger logger=Logger.getLogger(App.class);
# Optimization
Pre-splitting tables
# n split points => n+1 regions
create_namespace 'kb16'
create 'kb16:pre_split_n1','cf',SPLITS=>['10','20','30']
/hbase/data/kb16/pre_split_n1 contains (HDFS listing):
.tabledesc
.tmp
57ee8f6d69fc35bd5ded359b4a0aeb1b
7427fc9e45153f8c118f806abf722884
dae423145aece1af11340aa519b63f98
ec85df19801bcfc46a7b3da0017912f4
# 3 split points ('10','20','30') => 4 region directories
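# A hedged Java sketch of the same pre-split create through the older HTableDescriptor API used elsewhere in these notes (the table name is hypothetical; admin as in the Java API section above):
HTableDescriptor preSplit = new HTableDescriptor(TableName.valueOf("kb16:pre_split_n1_api"));
preSplit.addFamily(new HColumnDescriptor("cf"));
byte[][] splitKeys = {Bytes.toBytes("10"), Bytes.toBytes("20"), Bytes.toBytes("30")};
admin.createTable(preSplit, splitKeys); // 3 split points => 4 regions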
#shell
hbase org.apache.hadoop.hbase.util.RegionSplitter kb16:pre_split_hsp1 HexStringSplit -c 3 -f base
#hbase shell
create 'kb16:pre_split_pdp','base',{NUMREGIONS=>4,SPLITALGO=>'DecimalStringSplit'}
# Row key design
As short as possible, ideally within 16 bytes
Random number
UUID
MD5
Business-ordered data (auto-increment primary key)
Consider the region split policy
DelimitedKeyPrefixRegionSplitPolicy (e.g. key 343_f34343 splits on the prefix before the delimiter '_')
KeyPrefixRegionSplitPolicy
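# A hedged Java sketch of attaching one of these split policies to a table; the configuration key strings are my assumption of the standard policy properties (admin as in the Java API section above, table name hypothetical):
HTableDescriptor orders = new HTableDescriptor(TableName.valueOf("kb16:orders"));
orders.addFamily(new HColumnDescriptor("cf"));
// split on the part of the row key before the first '_' (e.g. '343' in '343_f34343')
orders.setRegionSplitPolicyClassName(
        "org.apache.hadoop.hbase.regionserver.DelimitedKeyPrefixRegionSplitPolicy");
orders.setValue("DelimitedKeyPrefixRegionSplitPolicy.delimiter", "_");
// alternative: KeyPrefixRegionSplitPolicy with a fixed prefix length
// orders.setValue("KeyPrefixRegionSplitPolicy.prefix_length", "3");
admin.createTable(orders);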