HBase Coprocessor Overview
Purpose
Coprocessors are HBase's analogue of stored procedures and triggers. They run on the server side, which cuts down on server/client I/O.
Types
Endpoints, which resemble stored procedures
Observers, which resemble triggers
The Coprocessor Family
Observers
RegionObserver: attached to a region; listens for operations on that region
RegionServerObserver: attached to a RegionServer; listens for operations on the whole RegionServer
MasterObserver: attached to the Master; can listen for DDL operations performed by the Master (a minimal sketch follows this list)
WALObserver: attached to the WAL; can listen for all WAL read/write operations
BulkLoadObserver: BulkLoad is a way to ingest data into HBase at scale from MapReduce/Spark; BulkLoadObserver can listen for BulkLoad activity
EndpointObserver: listens to the execution of Endpoints
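To make the HBase 2.x observer contract concrete, here is a minimal sketch of a MasterObserver that logs table creation. It is an illustration, not code from this project, and the class name LogDdlObserver is made up:

import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.MasterObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import java.io.IOException;
import java.util.Optional;

// Hypothetical observer: logs every table created through the Master.
public class LogDdlObserver implements MasterCoprocessor, MasterObserver {
    @Override
    public Optional<MasterObserver> getMasterObserver() {
        // In HBase 2.x the coprocessor must hand its observer back explicitly.
        return Optional.of(this);
    }

    @Override
    public void postCreateTable(ObserverContext<MasterCoprocessorEnvironment> ctx,
                                TableDescriptor desc, RegionInfo[] regions) throws IOException {
        System.out.println("table created: " + desc.getTableName());
    }
}

The same pattern (implement XxxCoprocessor plus XxxObserver, return the observer from getXxxObserver) applies to the other observer types; the RegionObserver example below follows it.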
HBase Coprocessor Examples
Observer Example
Suppose we need to store student records: student ID (the row key), name, age, gender, birthplace (down to the province), and actual admission score.
Later we may need to look students up by birthplace, e.g. find all students from Hubei. So we use an Observer coprocessor to maintain that index automatically: a birthplace-to-student index table, laid out as sketched below.
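For instance, after a student stu-100001 from hubei is inserted, scanning the index table should show roughly the following (illustrative output, timestamps omitted):

> scan 'birthplace_stu_index'
ROW            COLUMN+CELL
 hubei         column=birthplace_stu_info:stu-100001, value=1

One index row per province; each student appears as a column qualifier under the birthplace_stu_info family.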
Core Code
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.thinking</groupId>
    <artifactId>hello-hbase-coprocessor</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.2.0</version>
        </dependency>
    </dependencies>
</project>
BirthplaceToStu.java
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WALEdit;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
public class BirthplaceToStu implements RegionObserver, RegionCoprocessor {
    @Override
    public Optional<RegionObserver> getRegionObserver() {
        // Required in HBase 2.x: hand the observer back to the framework.
        RegionObserver observer = this;
        return Optional.of(observer);
    }

    @Override
    public void prePut(ObserverContext<RegionCoprocessorEnvironment> c, Put put, WALEdit edit, Durability durability) throws IOException {
        // Fires before every Put on the table this coprocessor is attached to.
        List<Cell> birthplace = put.get(Bytes.toBytes("basic_info"), Bytes.toBytes("birthplace"));
        if (birthplace == null || birthplace.isEmpty()) {
            // This Put does not touch the birthplace column, so there is nothing to index.
            System.out.println("----------------------birthplace is empty");
            return;
        }
        String stuId = Bytes.toString(put.getRow());
        System.out.println("----------------------stuId=" + stuId);
        String sBirthplace = Bytes.toString(CellUtil.cloneValue(birthplace.get(0)));
        // Index row: key = birthplace, one column per student ID.
        Put index = new Put(Bytes.toBytes(sBirthplace));
        index.addColumn(Bytes.toBytes("birthplace_stu_info"), Bytes.toBytes(stuId), Bytes.toBytes("1"));
        Connection connection = c.getEnvironment().getConnection();
        TableName tableName = TableName.valueOf("birthplace_stu_index");
        Table table = connection.getTable(tableName);
        table.put(index);
        table.close(); // opening and closing the table on every Put is not efficient
    }
}
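As the closing comment says, opening and closing the index table on every Put is inefficient. One possible improvement, shown below as a sketch rather than as this article's method, is to obtain a BufferedMutator once in start() and reuse it across requests. The class name BirthplaceToStuBuffered is made up, and the trade-off is that buffered index puts are written asynchronously, so a RegionServer crash can lose index entries that have not been flushed yet:

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WALEdit;
import java.io.IOException;
import java.util.List;
import java.util.Optional;

// Hypothetical variant of BirthplaceToStu that reuses one BufferedMutator.
public class BirthplaceToStuBuffered implements RegionObserver, RegionCoprocessor {
    private BufferedMutator indexMutator; // reused across requests

    @Override
    public Optional<RegionObserver> getRegionObserver() {
        return Optional.of(this);
    }

    @Override
    public void start(CoprocessorEnvironment env) throws IOException {
        // Obtain the mutator once, when the coprocessor is loaded.
        RegionCoprocessorEnvironment regionEnv = (RegionCoprocessorEnvironment) env;
        indexMutator = regionEnv.getConnection()
                .getBufferedMutator(TableName.valueOf("birthplace_stu_index"));
    }

    @Override
    public void stop(CoprocessorEnvironment env) throws IOException {
        if (indexMutator != null) {
            indexMutator.close(); // flushes any pending index puts
        }
    }

    @Override
    public void prePut(ObserverContext<RegionCoprocessorEnvironment> c, Put put,
                       WALEdit edit, Durability durability) throws IOException {
        List<Cell> birthplace = put.get(Bytes.toBytes("basic_info"), Bytes.toBytes("birthplace"));
        if (birthplace == null || birthplace.isEmpty()) {
            return;
        }
        String stuId = Bytes.toString(put.getRow());
        String sBirthplace = Bytes.toString(CellUtil.cloneValue(birthplace.get(0)));
        Put index = new Put(Bytes.toBytes(sBirthplace));
        index.addColumn(Bytes.toBytes("birthplace_stu_info"), Bytes.toBytes(stuId), Bytes.toBytes("1"));
        indexMutator.mutate(index); // buffered write; no per-Put table open/close
    }
}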
/*
#### build the jar first, e.g.: mvn clean package (in the project root)
$ hdfs dfs -mkdir -p /usr/alex
$ hdfs dfs -put /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT.jar /usr/alex/
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
> disable 'student'
#### coprocessor attribute format: 'jar-path|class-name|priority|arguments'; priority and arguments are left empty here to use the defaults
> alter 'student', METHOD => 'table_att', 'coprocessor' => '/usr/alex/hello-hbase-coprocessor-1.0-SNAPSHOT.jar|BirthplaceToStu||'
> enable 'student'
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
> scan 'student'
> scan 'birthplace_stu_index'
> disable 'student'
> alter 'student', METHOD => 'table_att_unset', NAME => 'coprocessor$1'
> enable 'student'
> exit
$ hdfs dfs -rm /usr/alex/hello-hbase-coprocessor-1.0-SNAPSHOT.jar
*/
Endpoint Example
Same scenario as above; now we need the maximum/minimum/average of the actual admission scores per province.
The previous section built the province --> student index, so it is easy to find every student of a given province and then compute the max/min/average admission score from those rows. But doing that computation on the client means shipping a great deal of raw data to it (a filter can drop unneeded columns and some irrelevant rows, yet the raw data that actually has to be aggregated can still be very large).
This is where Endpoints come in handy: the aggregation runs on the server and only the final numbers travel back.
Core Code
Server Side
pom.xml (hbase-server is marked provided here so that the jar-with-dependencies assembly bundles only protobuf-java, not all of HBase)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.thinking</groupId>
    <artifactId>hello-hbase-coprocessor</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.2.0</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.google.protobuf/protobuf-java -->
        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java</artifactId>
            <version>3.9.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.1.1</version>
                <!-- a coprocessor jar has no main class, so no manifest entry is needed -->
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
FractionalInfoService.proto文件
option java_package = "com.thinking.grpc.proto";
option java_outer_classname = "StuFractionalProto";
option java_generic_services = true;
option java_generate_equals_and_hash = true;
option optimize_for = SPEED;
message GetFractionalInfoReq {
required string place = 1;
required string type = 2;
}
message GetFractionalInfoResp {
required string place = 1;
required string type = 2;
required float fractional = 3;
}
service FractionalInfoService {
rpc GetFractionalInfo (GetFractionalInfoReq) returns (GetFractionalInfoResp);
}
//$ cd /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor
//$ protoc -Isrc/main/resources --java_out=src/main/java src/main/resources/FractionalInfoService.proto
FractionalInfoEndpoint.java
package com.thinking.grpc.impl;
import com.google.protobuf.RpcCallback;
import com.google.protobuf.RpcController;
import com.google.protobuf.Service;
import com.thinking.grpc.proto.StuFractionalProto;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.shaded.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
public class FractionalInfoEndpoint extends StuFractionalProto.FractionalInfoService implements RegionCoprocessor {
    private RegionCoprocessorEnvironment environment;

    @Override
    public void start(CoprocessorEnvironment env) throws IOException {
        System.out.println("----doing start----");
        environment = (RegionCoprocessorEnvironment) env;
    }

    @Override
    public void stop(CoprocessorEnvironment env) throws IOException {
    }

    @Override
    public Iterable<Service> getServices() {
        // Register this protobuf service with the RegionServer's RPC layer.
        Service service = this;
        return Collections.singleton(service);
    }

    @Override
    public void getFractionalInfo(RpcController controller, StuFractionalProto.GetFractionalInfoReq request, RpcCallback<StuFractionalProto.GetFractionalInfoResp> done) {
        String type = request.getType();
        String place = request.getPlace();
        StuFractionalProto.GetFractionalInfoResp.Builder result = StuFractionalProto.GetFractionalInfoResp.newBuilder().setPlace(place).setType(type);
        System.out.println("FractionalInfoEndpoint req-->" + type + "-->" + place);
        try {
            if (type.equals("MAX")) {
                System.out.println("do max");
                result.setFractional(getMax(place));
            } else if (type.equals("MIN")) {
                result.setFractional(getMin(place));
            } else if (type.equals("AVG")) {
                result.setFractional(getAvg(place));
            }
        } catch (IOException e) {
            System.out.println("error-->" + e.getMessage());
            ResponseConverter.setControllerException(controller, e);
        }
        done.run(result.build());
    }

    // Fetch the full student rows for every student indexed under the given place.
    private Result[] getStudentRows(String place) throws IOException {
        System.out.println("doing getStudentRows");
        TableName tabPlaceStusName = TableName.valueOf("birthplace_stu_index");
        Table tabPlaceStus = null;
        try {
            System.out.println("environment-->" + (environment == null ? "null" : "not null"));
            System.out.println("environment.getConnection()-->" + (environment.getConnection() == null ? "null" : "not null"));
            tabPlaceStus = environment.getConnection().getTable(tabPlaceStusName);
        } catch (Exception e) {
            System.out.println("get tab birthplace_stu_index error-->" + e.getMessage());
        }
        if (tabPlaceStus == null) {
            return null;
        }
        System.out.println("get tab birthplace_stu_index success");
        // One index row per place; its column qualifiers are the student IDs.
        Get getPlaceStus = new Get(Bytes.toBytes(place));
        Result result = tabPlaceStus.get(getPlaceStus);
        Map<byte[], byte[]> stusMap = result.getFamilyMap(Bytes.toBytes("birthplace_stu_info"));
        if (stusMap == null || stusMap.isEmpty()) {
            tabPlaceStus.close();
            return null; // no students indexed under this place
        }
        List<Get> getStuList = new ArrayList<>();
        System.out.println("get stu set");
        for (Map.Entry<byte[], byte[]> stu : stusMap.entrySet()) {
            System.out.println("get stu set-->" + Bytes.toString(stu.getKey()));
            getStuList.add(new Get(stu.getKey()));
        }
        tabPlaceStus.close();
        TableName tabStuName = TableName.valueOf("student");
        Table tabStu = environment.getConnection().getTable(tabStuName);
        // Materializing all rows at once can put heavy memory pressure on the RegionServer
        // for large data sets; this is one of the reasons to use MapReduce or Spark instead.
        Result[] results = tabStu.get(getStuList);
        tabStu.close();
        return results;
    }

    private float getMax(String place) throws IOException {
        System.out.println("doing max");
        Result[] results = getStudentRows(place);
        if (results == null) {
            return 0; // index row missing or index table unreachable
        }
        float max = 0; // admission scores are non-negative, so 0 is a safe floor
        for (Result result : results) {
            float value = Float.parseFloat(Bytes.toString(result.getValue(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"))));
            if (value > max) {
                max = value;
            }
        }
        System.out.println("get max --> " + max);
        return max;
    }

    private float getMin(String place) throws IOException {
        Result[] results = getStudentRows(place);
        if (results == null) {
            return 0;
        }
        float min = Float.MAX_VALUE;
        for (Result result : results) {
            float value = Float.parseFloat(Bytes.toString(result.getValue(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"))));
            if (value < min) {
                min = value;
            }
        }
        return min;
    }

    private float getAvg(String place) throws IOException {
        Result[] results = getStudentRows(place);
        if (results == null || results.length == 0) {
            return 0; // avoid dividing by zero
        }
        float total = 0;
        for (Result result : results) {
            total += Float.parseFloat(Bytes.toString(result.getValue(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"))));
        }
        return total / results.length;
    }
}
/*
#### Static deployment, together with BirthplaceToStu
$ stop-hbase.sh
$ rm -rf $HBASE_HOME/lib/hell*    # drop any previously copied jar
$ cp -r /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT-jar-with-dependencies.jar $HBASE_HOME/lib/
$ gedit $HBASE_HOME/conf/hbase-site.xml
<property>
    <name>hbase.coprocessor.user.region.classes</name>
    <value>BirthplaceToStu</value>
</property>
$ rm -rf $HBASE_HOME/logs/
$ start-hbase.sh
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
#### no shell command is needed to load it; every coprocessor listed in the config file is enabled globally
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
#### the put fails at this point (possibly because static deployment attaches the observer to every region of every table, including the index table itself, rather than just 'student'); we work around it below
*/
/*
#### com.thinking.grpc.impl.FractionalInfoEndpoint deployed statically, BirthplaceToStu deployed dynamically
$ stop-hbase.sh
$ rm -rf $HBASE_HOME/lib/hell*
$ cp -r /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT-jar-with-dependencies.jar $HBASE_HOME/lib/
$ gedit $HBASE_HOME/conf/hbase-site.xml
#### comment out (or remove) the property added above:
# <property>
#     <name>hbase.coprocessor.user.region.classes</name>
#     <value>BirthplaceToStu</value>
# </property>
$ rm -rf $HBASE_HOME/logs/
$ start-hbase.sh
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
> disable 'student'
#### the jar path is empty because the jar is already on the RegionServer classpath under $HBASE_HOME/lib
> alter 'student', METHOD => 'table_att', 'coprocessor' => '|BirthplaceToStu||'
> enable 'student'
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
> put 'student','stu-100001','basic_info:fractional','555'
> put 'student','stu-100002','basic_info:birthplace','jiangxi'
> put 'student','stu-100003','basic_info:fractional','556'
> scan 'student'
> scan 'birthplace_stu_index'
> exit
$ stop-hbase.sh
$ gedit $HBASE_HOME/conf/hbase-site.xml
<property>
    <name>hbase.coprocessor.user.region.classes</name>
    <value>com.thinking.grpc.impl.FractionalInfoEndpoint</value>
</property>
$ start-hbase.sh
*/
/*
#### Fully dynamic deployment also works
$ stop-hbase.sh
$ rm -rf $HBASE_HOME/lib/hell*
$ cp -r /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT-jar-with-dependencies.jar $HBASE_HOME/lib/
$ gedit $HBASE_HOME/conf/hbase-site.xml
#### make sure the hbase.coprocessor.user.region.classes property stays commented out:
# <property>
#     <name>hbase.coprocessor.user.region.classes</name>
#     <value>BirthplaceToStu</value>
# </property>
$ rm -rf $HBASE_HOME/logs/
$ start-hbase.sh
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
> disable 'student'
> alter 'student', METHOD => 'table_att', 'coprocessor' => '|BirthplaceToStu||'
> enable 'student'
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
> put 'student','stu-100001','basic_info:fractional','555'
> put 'student','stu-100002','basic_info:birthplace','jiangxi'
> put 'student','stu-100003','basic_info:fractional','556'
> scan 'student'
> scan 'birthplace_stu_index'
> disable 'student'
> alter 'student', METHOD => 'table_att', 'coprocessor' => '|com.thinking.grpc.impl.FractionalInfoEndpoint||'
> enable 'student'
> describe 'student'
*/
Some older references implement the endpoint as FractionalInfoEndpoint implements Coprocessor, CoprocessorService. On HBase 2.1.5 an endpoint loaded that way never has its public void start(CoprocessorEnvironment env) throws IOException method invoked, which leaves the RegionCoprocessorEnvironment environment field null.
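In other words, on HBase 2.x the endpoint has to opt in through the RegionCoprocessor interface, as the class above does. The decisive lines, excerpted from FractionalInfoEndpoint (not a standalone class):

public class FractionalInfoEndpoint extends StuFractionalProto.FractionalInfoService implements RegionCoprocessor {
    @Override
    public Iterable<Service> getServices() {
        // This is what makes the RegionServer register the RPC service
        // and invoke start()/stop() on the coprocessor.
        return Collections.singleton((Service) this);
    }
    // ...
}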
Client Side
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.thinking</groupId>
    <artifactId>hello-hbase-coprocessor-client</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.google.protobuf/protobuf-java -->
        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java</artifactId>
            <version>3.9.0</version>
        </dependency>
    </dependencies>
</project>
Main.java
import com.thinking.grpc.proto.StuFractionalProto;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.Map;
public class Main {
    /*
     * Put the cluster config on the client classpath first:
     * $ cd src/main/resources
     * $ mkdir hbase
     * $ cp -r $HBASE_HOME/conf/hbase-site.xml hbase/
     */
    public static void main(String[] args) throws Throwable {
        Configuration configuration = HBaseConfiguration.create();
        Connection connection = ConnectionFactory.createConnection(configuration);
        System.out.println("conn success!");
        TableName tableName = TableName.valueOf("student");
        Table table = connection.getTable(tableName);
        // Insert one student; the observer indexes him under 'hubei'.
        Put putBen = new Put(Bytes.toBytes("stu_001"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("name"), Bytes.toBytes("ben"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("gen"), Bytes.toBytes("M"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("age"), Bytes.toBytes("22"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("birthplace"), Bytes.toBytes("hubei"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"), Bytes.toBytes("555"));
        table.put(putBen);
        // Ask the endpoint for the maximum score in hubei; null start/end keys
        // mean the call is sent to every region of the table.
        StuFractionalProto.GetFractionalInfoReq req = StuFractionalProto.GetFractionalInfoReq.newBuilder().setPlace("hubei").setType("MAX").build();
        Map<byte[], Float> result = table.coprocessorService(StuFractionalProto.FractionalInfoService.class, null, null, new Batch.Call<StuFractionalProto.FractionalInfoService, Float>() {
            @Override
            public Float call(StuFractionalProto.FractionalInfoService instance) throws IOException {
                CoprocessorRpcUtils.BlockingRpcCallback<StuFractionalProto.GetFractionalInfoResp> callback = new CoprocessorRpcUtils.BlockingRpcCallback<>();
                instance.getFractionalInfo(null, req, callback);
                return callback.get().getFractional();
            }
        });
        // The returned map is keyed by region name.
        for (byte[] bytes : result.keySet()) {
            System.out.println(Bytes.toString(bytes) + "-->" + result.get(bytes));
        }
        table.close();
        connection.close();
    }
}
Client output
conn success!
student,,1563956471472.eaaac0653cb76856c8fe3f75038cdccb.-->555.0
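Note that coprocessorService(..., null, null, ...) invokes the endpoint once per region of 'student', and the returned Map is keyed by region name, which is why the region name appears in the output above. As written, getFractionalInfo computes the global answer from the index table on every invocation, so with more than one region the client would simply receive duplicate entries, one per region. An endpoint that returned per-region partial results would instead need a client-side merge; for MAX the merge is just the maximum of the returned values, e.g. (a sketch against the result map above):

// Merge per-region MAX partials into a global maximum.
float globalMax = Float.NEGATIVE_INFINITY;
for (Float regionMax : result.values()) {
    globalMax = Math.max(globalMax, regionMax);
}
System.out.println("global max = " + globalMax);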
Q & A
Q: Much of what coprocessors, and endpoints in particular, can do could also be done with Spark. What are the strengths and weaknesses of each, and when should which be used?
A: A coprocessor runs inside the RegionServer's process space. To keep the region responsive to regular read/write traffic, a coprocessor should not run large traversal jobs; full-table scans, joins and the like are better left to MapReduce or Spark.