Apache HBase
Java API
Add the following dependencies to your Maven project:
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.2.4</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-server</artifactId>
    <version>1.2.4</version>
</dependency>
When connecting to the HBase service we first create a Connection object. To execute DDL statements (data definition language: commands such as CREATE, ALTER, and DROP, used mostly when a table is first set up to define or change table structure, data types, relationships, and constraints), obtain an Admin object from the Connection; to execute DML statements (data manipulation language: SELECT, UPDATE, INSERT, and DELETE, the four commands that, as the name suggests, operate on the data itself), obtain a Table object.
public class HBaseDDLTest {
    private Admin admin;
    private Connection conn;
    private Table table;

    @Before
    public void before() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        conf.set(HConstants.ZOOKEEPER_QUORUM, "CentOS");
        conf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181");
        conn = ConnectionFactory.createConnection(conf);
        admin = conn.getAdmin();
        table = conn.getTable(TableName.valueOf("baizhi:t_user"));
    }

    @After
    public void after() throws IOException {
        table.close(); // release the Table before closing the Connection
        admin.close();
        conn.close();
    }
}
Regular Access
DDL
1. List all namespaces
NamespaceDescriptor[] descriptors = admin.listNamespaceDescriptors();
for (NamespaceDescriptor descriptor : descriptors) {
    System.out.println(descriptor.getName());
}
2. Create a namespace
//create_namespace 'zpark',{'creator'=>'zhangsan'}
NamespaceDescriptor namespaceDescriptor = NamespaceDescriptor.create("zpark")
        .addConfiguration("creator", "zhangsan")
        .build();
admin.createNamespace(namespaceDescriptor);
3. Modify a namespace
//alter_namespace 'zpark' ,{METHOD=>'unset',NAME=>'creator'}
NamespaceDescriptor namespaceDescriptor = NamespaceDescriptor.create("zpark")
        .removeConfiguration("creator")
        .build();
admin.modifyNamespace(namespaceDescriptor);
4. List the tables in a namespace
//list_namespace_tables 'baizhi'
TableName[] tables = admin.listTableNamesByNamespace("baizhi");
for (TableName tableName : tables) {
    System.out.println(tableName.getNameAsString());
}
5. Drop a namespace
//drop_namespace 'zpark'
admin.deleteNamespace("zpark");
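Note that only an empty namespace can be dropped; deleting one that still contains tables fails with an IOException. A minimal guard sketch, reusing the listing call from step 4:
// a namespace can only be dropped once it contains no tables
if (admin.listTableNamesByNamespace("zpark").length == 0) {
    admin.deleteNamespace("zpark");
}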
6. Create a table
//create 'zpark:t_user',{NAME=>'cf1',VERSIONS=>3,IN_MEMORY=>true,BLOOMFILTER=>'ROWCOL'},{NAME=>'cf2',TTL=>60}
HTableDescriptor tableDescriptor = new HTableDescriptor(TableName.valueOf("zpark:t_user"));
HColumnDescriptor cf1 = new HColumnDescriptor("cf1");
cf1.setMaxVersions(3);
cf1.setInMemory(true);
cf1.setBloomFilterType(BloomType.ROWCOL);
HColumnDescriptor cf2 = new HColumnDescriptor("cf2");
cf2.setTimeToLive(60); // TTL in seconds
tableDescriptor.addFamily(cf1);
tableDescriptor.addFamily(cf2);
admin.createTable(tableDescriptor);
7. Drop a table
//disable 'zpark:t_user'
//drop 'zpark:t_user'
TableName tableName = TableName.valueOf("zpark:t_user");
boolean exists = admin.tableExists(tableName);
if (!exists) {
    return;
}
boolean disabled = admin.isTableDisabled(tableName);
if (!disabled) {
    admin.disableTable(tableName); // a table must be disabled before it can be dropped
}
admin.deleteTable(tableName);
8. Truncate a table
TableName tableName = TableName.valueOf("baizhi:t_user");
boolean disabled = admin.isTableDisabled(tableName);
if (!disabled) {
    admin.disableTable(tableName);
}
admin.truncateTable(tableName, false); // false: do not preserve the existing region splits
DML
1. Insert/update - put
// assumes a static import: import static org.apache.hadoop.hbase.util.Bytes.toBytes;
String[] depts = new String[]{"search", "sale", "manager"};
for (int i = 0; i <= 1000; i++) {
    DecimalFormat format = new DecimalFormat("0000");
    String rowKey = format.format(i); // zero-padded row keys: 0000..1000
    Put put = new Put(toBytes(rowKey));
    put.addColumn(toBytes("cf1"), toBytes("name"), toBytes("user" + rowKey));
    put.addColumn(toBytes("cf1"), toBytes("salary"), toBytes(100.0 * i));
    put.addColumn(toBytes("cf1"), toBytes("dept"), toBytes(depts[new Random().nextInt(3)]));
    table.put(put);
}
String[] depts = new String[]{"search", "sale", "manager"};
// batch inserts/updates through a client-side write buffer
BufferedMutator bufferedMutator = conn.getBufferedMutator(TableName.valueOf("baizhi:t_user"));
for (int i = 1000; i <= 2000; i++) {
    DecimalFormat format = new DecimalFormat("0000");
    String rowKey = format.format(i);
    Put put = new Put(toBytes(rowKey));
    put.addColumn(toBytes("cf1"), toBytes("name"), toBytes("user" + rowKey));
    put.addColumn(toBytes("cf1"), toBytes("salary"), toBytes(100.0 * i));
    put.addColumn(toBytes("cf1"), toBytes("dept"), toBytes(depts[new Random().nextInt(3)]));
    bufferedMutator.mutate(put);
    if (i % 500 == 0 && i > 1000) { // flush the buffer every 500 rows
        bufferedMutator.flush();
    }
}
bufferedMutator.close(); // close() flushes any remaining buffered mutations
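The write buffer can also be sized explicitly through BufferedMutatorParams. A minimal sketch, assuming the default buffer is too small for your batch (the 8 MB value below is illustrative):
// size the client-side write buffer; buffered mutations are sent
// automatically whenever the buffer fills up, or on flush()/close()
BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf("baizhi:t_user"))
        .writeBufferSize(8 * 1024 * 1024); // 8 MB (illustrative)
BufferedMutator mutator = conn.getBufferedMutator(params);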
2. Read a single row (containing multiple Cells) - get
Get get = new Get(toBytes("2000"));
Result result = table.get(get); // one row, possibly containing multiple Cells
byte[] bname = result.getValue(toBytes("cf1"), toBytes("name"));
byte[] bdept = result.getValue(toBytes("cf1"), toBytes("dept"));
byte[] bsalary = result.getValue(toBytes("cf1"), toBytes("salary"));
String name = Bytes.toString(bname);
String dept = Bytes.toString(bdept);
Double salary = Bytes.toDouble(bsalary);
System.out.println(name + " " + dept + " " + salary);
There are several ways to get Cells out of a Result. getValue is the most commonly used, and requires the caller to specify the column. To iterate over all Cells of a Result, use CellScanner or listCells (the snippets below assume static imports of CellUtil.cloneQualifier, cloneValue, and cloneRow, and of Bytes.toDouble).
Get get = new Get(toBytes("2000"));
Result result = table.get(get); // one row, possibly containing multiple Cells
CellScanner cellScanner = result.cellScanner();
while (cellScanner.advance()) {
    Cell cell = cellScanner.current();
    // column qualifier of the Cell
    String qualifier = Bytes.toString(cloneQualifier(cell));
    // value of the Cell
    Object value = null;
    if (qualifier.equals("salary")) {
        value = toDouble(cloneValue(cell));
    } else {
        value = Bytes.toString(cloneValue(cell));
    }
    // RowKey of the Cell
    String rowKey = Bytes.toString(cloneRow(cell));
    System.out.println(rowKey + " " + qualifier + " " + value);
}
Get get = new Get(toBytes("2000"));
Result result = table.get(get); // one row, possibly containing multiple Cells
List<Cell> cells = result.listCells();
for (Cell cell : cells) {
    // column qualifier of the Cell
    String qualifier = Bytes.toString(cloneQualifier(cell));
    // value of the Cell
    Object value = null;
    if (qualifier.equals("salary")) {
        value = toDouble(cloneValue(cell));
    } else {
        value = Bytes.toString(cloneValue(cell));
    }
    // RowKey of the Cell
    String rowKey = Bytes.toString(cloneRow(cell));
    System.out.println(rowKey + " " + qualifier + " " + value);
}
You can also use the getColumnCells method to retrieve multiple versions of a single Cell:
Get get = new Get(toBytes("2000"));
get.setMaxVersions(3);
get.setTimeStamp(1602299440060L); // restricts the read to cells with exactly this timestamp
Result result = table.get(get); // one row, possibly containing multiple Cells
List<Cell> salaryCells = result.getColumnCells(toBytes("cf1"), toBytes("salary"));
for (Cell salaryCell : salaryCells) {
    System.out.println(toDouble(cloneValue(salaryCell)));
}
3. Increment a Cell value - incr. Note that Increment interprets the target cell as an 8-byte long, so it must not be pointed at cf1:salary, which was written above as a double; the sketch below therefore targets a hypothetical long-valued counter column cf1:visits.
Increment increment = new Increment(toBytes("2000"));
// cf1:visits is a hypothetical counter column holding an 8-byte long;
// if the cell does not exist yet, HBase initializes it with the delta
increment.addColumn(toBytes("cf1"), toBytes("visits"), 1000L);
table.increment(increment);
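For a single column, Table also offers a shortcut with the same semantics, again against the hypothetical cf1:visits counter:
// returns the new counter value after applying the delta
long visits = table.incrementColumnValue(toBytes("2000"), toBytes("cf1"), toBytes("visits"), 1L);
System.out.println("visits = " + visits);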
4. Delete data - delete/deleteall
- deleteall
Delete delete = new Delete(toBytes("2000")); // no column specified: deletes the entire row
table.delete(delete);
- delete
Delete delete = new Delete(toBytes("2000"));
// addColumn deletes only the latest version of the cell; use addColumns to delete all versions
delete.addColumn(toBytes("cf1"), toBytes("salary"));
table.delete(delete);
5. Table scan - scan
Scan scan = new Scan();
ResultScanner scanner = table.getScanner(scan);
Iterator<Result> resultIterator = scanner.iterator();
while (resultIterator.hasNext()) {
    Result result = resultIterator.next();
    byte[] bname = result.getValue(toBytes("cf1"), toBytes("name"));
    byte[] bdept = result.getValue(toBytes("cf1"), toBytes("dept"));
    byte[] bsalary = result.getValue(toBytes("cf1"), toBytes("salary"));
    String name = Bytes.toString(bname);
    String dept = Bytes.toString(bdept);
    Double salary = Bytes.toDouble(bsalary);
    String rowKey = Bytes.toString(result.getRow());
    System.out.println(rowKey + " " + name + " " + dept + " " + salary);
}
We can configure the Scan object with custom conditions to satisfy more complex queries. The start row is inclusive and the stop row exclusive, and the FilterList below passes a row if it matches either filter (MUST_PASS_ONE is a logical OR):
Scan scan = new Scan();
scan.setStartRow(toBytes("1000")); // inclusive
scan.setStopRow(toBytes("1100")); // exclusive
//scan.setRowPrefixFilter(toBytes("108"));
Filter filter1 = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator("09$"));
Filter filter2 = new RowFilter(CompareFilter.CompareOp.EQUAL, new SubstringComparator("80"));
FilterList filter = new FilterList(FilterList.Operator.MUST_PASS_ONE, filter1, filter2);
scan.setFilter(filter);
ResultScanner scanner = table.getScanner(scan);
Iterator<Result> resultIterator = scanner.iterator();
while (resultIterator.hasNext()) {
    Result result = resultIterator.next();
    byte[] bname = result.getValue(toBytes("cf1"), toBytes("name"));
    byte[] bdept = result.getValue(toBytes("cf1"), toBytes("dept"));
    byte[] bsalary = result.getValue(toBytes("cf1"), toBytes("salary"));
    String name = Bytes.toString(bname);
    String dept = Bytes.toString(bdept);
    Double salary = Bytes.toDouble(bsalary);
    String rowKey = Bytes.toString(result.getRow());
    System.out.println(rowKey + " " + name + " " + dept + " " + salary);
}
For more Filters see: https://www.jianshu.com/p/bcc54f63abe4
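As one further example, a value-based filter (SingleColumnValueFilter, from the same filter package as RowFilter) selects rows by a column's content rather than by the row key. A minimal sketch against the cf1:dept column written earlier; note that setFilterIfMissing(true) is needed so rows lacking the column are dropped:
// keep only the rows whose cf1:dept equals "sale"
SingleColumnValueFilter deptFilter = new SingleColumnValueFilter(
        toBytes("cf1"), toBytes("dept"),
        CompareFilter.CompareOp.EQUAL, toBytes("sale"));
deptFilter.setFilterIfMissing(true); // otherwise rows without cf1:dept would pass
Scan scan = new Scan();
scan.setFilter(deptFilter);
for (Result result : table.getScanner(scan)) {
    System.out.println(Bytes.toString(result.getRow()));
}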
MapReduce Integration
HBase ships with MapReduce integration in the form of the TableInputFormat/TableOutputFormat input and output formats. You only need to write your job against these formats.
Note that because TableInputFormat is used, the input splits are computed on the client at job-submission time, so the HBase classpath must be configured on the node that submits the job:
[root@CentOS ~]# vi .bashrc
JAVA_HOME=/usr/java/latest
HADOOP_HOME=/usr/hadoop-2.9.2/
HBASE_HOME=/usr/hbase-1.2.4/
PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HBASE_HOME/bin
CLASSPATH=.
export JAVA_HOME
export PATH
export CLASSPATH
export HADOOP_HOME
export HBASE_HOME
HBASE_CLASSPATH=$(/usr/hbase-1.2.4/bin/hbase classpath)
HADOOP_CLASSPATH=$HBASE_CLASSPATH:/root/mysql-connector-java-5.1.49.jar
export HADOOP_CLASSPATH
[root@CentOS ~]# source .bashrc
public class AvgSalaryApplication extends Configured implements Tool {
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        conf = HBaseConfiguration.create(conf);
        conf.set(HConstants.ZOOKEEPER_QUORUM, "CentOS");
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec", GzipCodec.class, CompressionCodec.class);

        Job job = Job.getInstance(conf, "AvgSalaryApplication");
        job.setJarByClass(AvgSalaryApplication.class);
        job.setInputFormatClass(TableInputFormat.class);
        job.setOutputFormatClass(TableOutputFormat.class);

        TableMapReduceUtil.initTableMapperJob(
                "baizhi:t_user", new Scan(), AvgSalaryMapper.class,
                Text.class,
                DoubleWritable.class,
                job
        );
        TableMapReduceUtil.initTableReducerJob(
                "baizhi:t_result",
                AvgSalaryReducer.class,
                job
        );
        job.setNumReduceTasks(3);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new AvgSalaryApplication(), args);
    }
}
public class AvgSalaryMapper extends TableMapper<Text, DoubleWritable> {
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        String dept = Bytes.toString(value.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("dept")));
        Double salary = Bytes.toDouble(value.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("salary")));
        context.write(new Text(dept), new DoubleWritable(salary));
    }
}
public class AvgSalaryReducer extends TableReducer<Text, DoubleWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
        double sum = 0.0;
        int count = 0;
        for (DoubleWritable value : values) {
            count++;
            sum += value.get();
        }
        // key.copyBytes() trims the Text backing array to its actual length;
        // key.getBytes() may include trailing garbage bytes
        Put put = new Put(key.copyBytes());
        put.addColumn("cf1".getBytes(), "avg".getBytes(), ((sum / count) + "").getBytes());
        context.write(NullWritable.get(), put);
    }
}
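The job writes into baizhi:t_result, which must exist with a cf1 family before the job is submitted; TableOutputFormat does not create it. A minimal sketch using the Admin API from earlier, assuming the baizhi namespace already exists:
// create the output table if it is missing
TableName resultTable = TableName.valueOf("baizhi:t_result");
if (!admin.tableExists(resultTable)) {
    HTableDescriptor descriptor = new HTableDescriptor(resultTable);
    descriptor.addFamily(new HColumnDescriptor("cf1"));
    admin.createTable(descriptor);
}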
Note that by default TableInputFormat produces one input split (and therefore one map task) per Region of the source table.
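So to get more map parallelism the source table needs more Regions; one way is to pre-split the table when it is created. A minimal sketch, assuming a fresh table (the key range and region count below are illustrative):
// pre-split a fresh table into 4 regions over the row-key range 0000..2000,
// so TableInputFormat would produce 4 splits instead of 1
HTableDescriptor descriptor = new HTableDescriptor(TableName.valueOf("baizhi:t_user"));
descriptor.addFamily(new HColumnDescriptor("cf1"));
admin.createTable(descriptor, Bytes.toBytes("0000"), Bytes.toBytes("2000"), 4);
System.out.println("regions: " + admin.getTableRegions(TableName.valueOf("baizhi:t_user")).size());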
[root@CentOS ~]# hadoop jar HBase-1.0-SNAPSHOT.jar com.baizhi.mapreduce.AvgSalaryApplication
Coprocessors (Advanced)
As a column-family database, HBase is most often criticized for two things: it is hard to build secondary indexes on, and hard to use for sums, counts, sorting, and similar operations. For example, in older versions of HBase (< 0.92), counting the total rows of a table required running a full MapReduce job. Although HBase integrates MapReduce at the storage layer and can run distributed computations over tables effectively, in many cases of simple addition or aggregation, pushing the computation to the server side reduces communication overhead and yields a substantial performance gain. HBase therefore introduced coprocessors in 0.92, enabling some exciting new features: easy secondary indexing, complex filters (predicate pushdown), and access control.
Broadly speaking, there are two kinds of coprocessors: Observers and Endpoints.
Observer
An Observer is similar to a trigger in a traditional database: the server invokes this kind of coprocessor when certain events occur. Observer coprocessors are hooks scattered through the HBase server code that are called when fixed events fire. For example, the put operation has a prePut hook that the RegionServer calls before the put executes, and a postPut hook called after it.
1. Write an observer
public class UserAppendObServer extends BaseRegionObserver {
    private final static Log LOG = LogFactory.getLog(UserAppendObServer.class);

    @Override
    public Result preAppend(ObserverContext<RegionCoprocessorEnvironment> e, Append append) throws IOException {
        LOG.info("User Append SomeThing ~~~~~~~~~~~~~~");
        return null; // returning null lets HBase proceed with the normal append
    }
}
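A hook for the prePut event mentioned above looks very similar; a minimal sketch (this override is illustrative and not part of the packaged example):
// called on the RegionServer before every Put against a region of the table
@Override
public void prePut(ObserverContext<RegionCoprocessorEnvironment> e, Put put,
                   WALEdit edit, Durability durability) throws IOException {
    LOG.info("prePut row=" + Bytes.toString(put.getRow()));
}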
2. Package the code and upload it to HDFS
[root@CentOS ~]# hdfs dfs -mkdir /libs
[root@CentOS ~]# hdfs dfs -put HBase-1.0-SNAPSHOT.jar /libs/
3. Start HBase and tail the RegionServer log in real time
[root@CentOS ~]# rm -rf /usr/hbase-1.2.4/logs/*
[root@CentOS ~]# start-hbase.sh
[root@CentOS ~]# tail -f /usr/hbase-1.2.4/logs/hbase-root-regionserver-CentOS.log
4. Attach the coprocessor to zpark:t_user
[root@CentOS ~]# hbase shell
hbase(main):001:0> disable 'zpark:t_user'
hbase(main):003:0> alter 'zpark:t_user' , METHOD =>'table_att','coprocessor'=>'hdfs:///libs/HBase-1.0-SNAPSHOT.jar|com.baizhi.observer.UserAppendObServer|1001'
Updating all regions with the new schema...
1/1 regions updated.
Done.
0 row(s) in 2.0830 seconds
hbase(main):004:0> enable 'zpark:t_user'
0 row(s) in 1.2890 seconds
5. Verify that the observer took effect
hbase(main):005:0> desc 'zpark:t_user'
Table zpark:t_user is ENABLED
zpark:t_user, {TABLE_ATTRIBUTES => {coprocessor$1 => 'hdfs:///libs/HBase-1.0-SNAPSHOT.jar|com.baizhi.observer.UserAppendObServer|1001'}
COLUMN FAMILIES DESCRIPTION
{NAME => 'cf1', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLO
CKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'}
{NAME => 'cf2', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLO
CKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'}
2 row(s) in 0.0490 seconds
6. Run an Append command and watch the log output
hbase(main):003:0> append 'zpark:t_user','001','cf1:subscribe','002|'
0 row(s) in 0.2140 seconds
2020-10-10 17:23:20,847 INFO [B.defaultRpcServer.handler=3,queue=0,port=16020] observer.UserAppendObServer: User Append SomeThing ~~~~~~~~~~~~~~