HBase Coprocessor Overview
Purpose
Coprocessors are HBase's analogue of stored procedures and triggers. They run on the server side, which cuts down on server/client I/O.
Types
Endpoints, which resemble stored procedures
Observers, which resemble triggers
The Coprocessor Family
Observers
RegionObserver: attached to a region; listens for operations on that region
RegionServerObserver: attached to a RegionServer; listens for operations on the whole RegionServer
MasterObserver: attached to the Master; can listen for DDL operations performed by the Master (a minimal sketch follows this list)
WALObserver: attached to the WAL; can listen for all WAL read/write operations
BulkLoadObserver: BulkLoad is a way to ingest data into HBase at scale from MapReduce/Spark; BulkLoadObserver can listen for BulkLoad activity
EndpointObserver: listens to the execution of Endpoints
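To make the HBase 2.x observer contract concrete, here is a minimal sketch of a MasterObserver that logs table creation. It is an illustration, not code from this project, and the class name LogDdlObserver is made up:

import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.MasterObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import java.io.IOException;
import java.util.Optional;

// Hypothetical observer: logs every table created through the Master.
public class LogDdlObserver implements MasterCoprocessor, MasterObserver {
    @Override
    public Optional<MasterObserver> getMasterObserver() {
        // In HBase 2.x the coprocessor must hand its observer back explicitly.
        return Optional.of(this);
    }

    @Override
    public void postCreateTable(ObserverContext<MasterCoprocessorEnvironment> ctx,
                                TableDescriptor desc, RegionInfo[] regions) throws IOException {
        System.out.println("table created: " + desc.getTableName());
    }
}

The same pattern (implement XxxCoprocessor plus XxxObserver, return the observer from getXxxObserver) applies to the other observer types; the RegionObserver example below follows it.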
HBase Coprocessor Examples
Observer Example
Suppose we need to store student records: student ID (the row key), name, age, gender, birthplace (down to the province), and actual admission score.
Later we may need to look students up by birthplace, e.g. find all students from Hubei. So we use an Observer coprocessor to maintain that index automatically: a birthplace-to-student index table, laid out as sketched below.
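For instance, after a student stu-100001 from hubei is inserted, scanning the index table should show roughly the following (illustrative output, timestamps omitted):

> scan 'birthplace_stu_index'
ROW            COLUMN+CELL
 hubei         column=birthplace_stu_info:stu-100001, value=1

One index row per province; each student appears as a column qualifier under the birthplace_stu_info family.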
Core Code
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.thinking</groupId>
    <artifactId>hello-hbase-coprocessor</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.2.0</version>
        </dependency>
    </dependencies>
</project>
BirthplaceToStu.java
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WALEdit;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
public class BirthplaceToStu implements RegionObserver, RegionCoprocessor {
    @Override
    public Optional<RegionObserver> getRegionObserver() {
        // Required in HBase 2.x: hand the observer back to the framework.
        RegionObserver observer = this;
        return Optional.of(observer);
    }

    @Override
    public void prePut(ObserverContext<RegionCoprocessorEnvironment> c, Put put, WALEdit edit, Durability durability) throws IOException {
        // Fires before every Put on the table this coprocessor is attached to.
        List<Cell> birthplace = put.get(Bytes.toBytes("basic_info"), Bytes.toBytes("birthplace"));
        if (birthplace == null || birthplace.isEmpty()) {
            // This Put does not touch the birthplace column, so there is nothing to index.
            System.out.println("----------------------birthplace is empty");
            return;
        }
        String stuId = Bytes.toString(put.getRow());
        System.out.println("----------------------stuId=" + stuId);
        String sBirthplace = Bytes.toString(CellUtil.cloneValue(birthplace.get(0)));
        // Index row: key = birthplace, one column per student ID.
        Put index = new Put(Bytes.toBytes(sBirthplace));
        index.addColumn(Bytes.toBytes("birthplace_stu_info"), Bytes.toBytes(stuId), Bytes.toBytes("1"));
        Connection connection = c.getEnvironment().getConnection();
        TableName tableName = TableName.valueOf("birthplace_stu_index");
        Table table = connection.getTable(tableName);
        table.put(index);
        table.close(); // opening and closing the table on every Put is not efficient
    }
}
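As the closing comment says, opening and closing the index table on every Put is inefficient. One possible improvement, shown below as a sketch rather than as this article's method, is to obtain a BufferedMutator once in start() and reuse it across requests. The class name BirthplaceToStuBuffered is made up, and the trade-off is that buffered index puts are written asynchronously, so a RegionServer crash can lose index entries that have not been flushed yet:

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WALEdit;
import java.io.IOException;
import java.util.List;
import java.util.Optional;

// Hypothetical variant of BirthplaceToStu that reuses one BufferedMutator.
public class BirthplaceToStuBuffered implements RegionObserver, RegionCoprocessor {
    private BufferedMutator indexMutator; // reused across requests

    @Override
    public Optional<RegionObserver> getRegionObserver() {
        return Optional.of(this);
    }

    @Override
    public void start(CoprocessorEnvironment env) throws IOException {
        // Obtain the mutator once, when the coprocessor is loaded.
        RegionCoprocessorEnvironment regionEnv = (RegionCoprocessorEnvironment) env;
        indexMutator = regionEnv.getConnection()
                .getBufferedMutator(TableName.valueOf("birthplace_stu_index"));
    }

    @Override
    public void stop(CoprocessorEnvironment env) throws IOException {
        if (indexMutator != null) {
            indexMutator.close(); // flushes any pending index puts
        }
    }

    @Override
    public void prePut(ObserverContext<RegionCoprocessorEnvironment> c, Put put,
                       WALEdit edit, Durability durability) throws IOException {
        List<Cell> birthplace = put.get(Bytes.toBytes("basic_info"), Bytes.toBytes("birthplace"));
        if (birthplace == null || birthplace.isEmpty()) {
            return;
        }
        String stuId = Bytes.toString(put.getRow());
        String sBirthplace = Bytes.toString(CellUtil.cloneValue(birthplace.get(0)));
        Put index = new Put(Bytes.toBytes(sBirthplace));
        index.addColumn(Bytes.toBytes("birthplace_stu_info"), Bytes.toBytes(stuId), Bytes.toBytes("1"));
        indexMutator.mutate(index); // buffered write; no per-Put table open/close
    }
}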
/*
#### build the jar first, e.g.: mvn clean package (in the project root)
$ hdfs dfs -mkdir -p /usr/alex
$ hdfs dfs -put /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT.jar /usr/alex/
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
> disable 'student'
#### coprocessor attribute format: 'jar-path|class-name|priority|arguments'; priority and arguments are left empty here to use the defaults
> alter 'student', METHOD => 'table_att', 'coprocessor' => '/usr/alex/hello-hbase-coprocessor-1.0-SNAPSHOT.jar|BirthplaceToStu||'
> enable 'student'
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
> scan 'student'
> scan 'birthplace_stu_index'
> disable 'student'
> alter 'student', METHOD => 'table_att_unset', NAME => 'coprocessor$1'
> enable 'student'
> exit
$ hdfs dfs -rm /usr/alex/hello-hbase-coprocessor-1.0-SNAPSHOT.jar
*/
Endpoint Example
Same scenario as above; now we need the maximum/minimum/average of the actual admission scores per province.
The previous section built the province --> student index, so it is easy to find every student of a given province and then compute the max/min/average admission score from those rows. But doing that computation on the client means shipping a great deal of raw data to it (a filter can drop unneeded columns and some irrelevant rows, yet the raw data that actually has to be aggregated can still be very large).
This is where Endpoints come in handy: the aggregation runs on the server and only the final numbers travel back.
Core Code
Server Side
pom.xml (hbase-server is marked provided here so that the jar-with-dependencies assembly bundles only protobuf-java, not all of HBase)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.thinking</groupId>
    <artifactId>hello-hbase-coprocessor</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.2.0</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.google.protobuf/protobuf-java -->
        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java</artifactId>
            <version>3.9.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.1.1</version>
                <!-- a coprocessor jar has no main class, so no manifest entry is needed -->
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
FractionalInfoService.proto文件
option java_package = "com.thinking.grpc.proto";
option java_outer_classname = "StuFractionalProto";
option java_generic_services = true;
option java_generate_equals_and_hash = true;
option optimize_for = SPEED;
message GetFractionalInfoReq {
required string place = 1;
required string type = 2;
}
message GetFractionalInfoResp {
required string place = 1;
required string type = 2;
required float fractional = 3;
}
service FractionalInfoService {
rpc GetFractionalInfo (GetFractionalInfoReq) returns (GetFractionalInfoResp);
}
//$ cd /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor
//$ protoc -Isrc/main/resources --java_out=src/main/java src/main/resources/FractionalInfoService.proto
FractionalInfoEndpoint.java
package com.thinking.grpc.impl;
import com.google.protobuf.RpcCallback;
import com.google.protobuf.RpcController;
import com.google.protobuf.Service;
import com.thinking.grpc.proto.StuFractionalProto;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.shaded.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
public class FractionalInfoEndpoint extends StuFractionalProto.FractionalInfoService implements RegionCoprocessor {
    private RegionCoprocessorEnvironment environment;

    @Override
    public void start(CoprocessorEnvironment env) throws IOException {
        System.out.println("----doing start----");
        environment = (RegionCoprocessorEnvironment) env;
    }

    @Override
    public void stop(CoprocessorEnvironment env) throws IOException {
    }

    @Override
    public Iterable<Service> getServices() {
        // Register this protobuf service with the RegionServer's RPC layer.
        Service service = this;
        return Collections.singleton(service);
    }

    @Override
    public void getFractionalInfo(RpcController controller, StuFractionalProto.GetFractionalInfoReq request, RpcCallback<StuFractionalProto.GetFractionalInfoResp> done) {
        String type = request.getType();
        String place = request.getPlace();
        StuFractionalProto.GetFractionalInfoResp.Builder result = StuFractionalProto.GetFractionalInfoResp.newBuilder().setPlace(place).setType(type);
        System.out.println("FractionalInfoEndpoint req-->" + type + "-->" + place);
        try {
            if (type.equals("MAX")) {
                System.out.println("do max");
                result.setFractional(getMax(place));
            } else if (type.equals("MIN")) {
                result.setFractional(getMin(place));
            } else if (type.equals("AVG")) {
                result.setFractional(getAvg(place));
            }
        } catch (IOException e) {
            System.out.println("error-->" + e.getMessage());
            ResponseConverter.setControllerException(controller, e);
        }
        done.run(result.build());
    }

    // Fetch the full student rows for every student indexed under the given place.
    private Result[] getStudentRows(String place) throws IOException {
        System.out.println("doing getStudentRows");
        TableName tabPlaceStusName = TableName.valueOf("birthplace_stu_index");
        Table tabPlaceStus = null;
        try {
            System.out.println("environment-->" + (environment == null ? "null" : "not null"));
            System.out.println("environment.getConnection()-->" + (environment.getConnection() == null ? "null" : "not null"));
            tabPlaceStus = environment.getConnection().getTable(tabPlaceStusName);
        } catch (Exception e) {
            System.out.println("get tab birthplace_stu_index error-->" + e.getMessage());
        }
        if (tabPlaceStus == null) {
            return null;
        }
        System.out.println("get tab birthplace_stu_index success");
        // One index row per place; its column qualifiers are the student IDs.
        Get getPlaceStus = new Get(Bytes.toBytes(place));
        Result result = tabPlaceStus.get(getPlaceStus);
        Map<byte[], byte[]> stusMap = result.getFamilyMap(Bytes.toBytes("birthplace_stu_info"));
        if (stusMap == null || stusMap.isEmpty()) {
            tabPlaceStus.close();
            return null; // no students indexed under this place
        }
        List<Get> getStuList = new ArrayList<>();
        System.out.println("get stu set");
        for (Map.Entry<byte[], byte[]> stu : stusMap.entrySet()) {
            System.out.println("get stu set-->" + Bytes.toString(stu.getKey()));
            getStuList.add(new Get(stu.getKey()));
        }
        tabPlaceStus.close();
        TableName tabStuName = TableName.valueOf("student");
        Table tabStu = environment.getConnection().getTable(tabStuName);
        // Materializing all rows at once can put heavy memory pressure on the RegionServer
        // for large data sets; this is one of the reasons to use MapReduce or Spark instead.
        Result[] results = tabStu.get(getStuList);
        tabStu.close();
        return results;
    }

    private float getMax(String place) throws IOException {
        System.out.println("doing max");
        Result[] results = getStudentRows(place);
        if (results == null) {
            return 0; // index row missing or index table unreachable
        }
        float max = 0; // admission scores are non-negative, so 0 is a safe floor
        for (Result result : results) {
            float value = Float.parseFloat(Bytes.toString(result.getValue(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"))));
            if (value > max) {
                max = value;
            }
        }
        System.out.println("get max --> " + max);
        return max;
    }

    private float getMin(String place) throws IOException {
        Result[] results = getStudentRows(place);
        if (results == null) {
            return 0;
        }
        float min = Float.MAX_VALUE;
        for (Result result : results) {
            float value = Float.parseFloat(Bytes.toString(result.getValue(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"))));
            if (value < min) {
                min = value;
            }
        }
        return min;
    }

    private float getAvg(String place) throws IOException {
        Result[] results = getStudentRows(place);
        if (results == null || results.length == 0) {
            return 0; // avoid dividing by zero
        }
        float total = 0;
        for (Result result : results) {
            total += Float.parseFloat(Bytes.toString(result.getValue(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"))));
        }
        return total / results.length;
    }
}
/*
#### Static deployment, together with BirthplaceToStu
$ stop-hbase.sh
$ rm -rf $HBASE_HOME/lib/hell*    # drop any previously copied jar
$ cp -r /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT-jar-with-dependencies.jar $HBASE_HOME/lib/
$ gedit $HBASE_HOME/conf/hbase-site.xml
<property>
    <name>hbase.coprocessor.user.region.classes</name>
    <value>BirthplaceToStu</value>
</property>
$ rm -rf $HBASE_HOME/logs/
$ start-hbase.sh
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
#### no shell command is needed to load it; every coprocessor listed in the config file is enabled globally
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
#### the put fails at this point (possibly because static deployment attaches the observer to every region of every table, including the index table itself, rather than just 'student'); we work around it below
*/
/*
#### com.thinking.grpc.impl.FractionalInfoEndpoint deployed statically, BirthplaceToStu deployed dynamically
$ stop-hbase.sh
$ rm -rf $HBASE_HOME/lib/hell*
$ cp -r /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT-jar-with-dependencies.jar $HBASE_HOME/lib/
$ gedit $HBASE_HOME/conf/hbase-site.xml
#### comment out (or remove) the property added above:
# <property>
#     <name>hbase.coprocessor.user.region.classes</name>
#     <value>BirthplaceToStu</value>
# </property>
$ rm -rf $HBASE_HOME/logs/
$ start-hbase.sh
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
> disable 'student'
#### the jar path is empty because the jar is already on the RegionServer classpath under $HBASE_HOME/lib
> alter 'student', METHOD => 'table_att', 'coprocessor' => '|BirthplaceToStu||'
> enable 'student'
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
> put 'student','stu-100001','basic_info:fractional','555'
> put 'student','stu-100002','basic_info:birthplace','jiangxi'
> put 'student','stu-100003','basic_info:fractional','556'
> scan 'student'
> scan 'birthplace_stu_index'
> exit
$ stop-hbase.sh
$ gedit $HBASE_HOME/conf/hbase-site.xml
<property>
    <name>hbase.coprocessor.user.region.classes</name>
    <value>com.thinking.grpc.impl.FractionalInfoEndpoint</value>
</property>
$ start-hbase.sh
*/
/*
#### Fully dynamic deployment also works
$ stop-hbase.sh
$ rm -rf $HBASE_HOME/lib/hell*
$ cp -r /home/yong/stu-hadoop20190717001/hello-hbase-coprocessor/target/hello-hbase-coprocessor-1.0-SNAPSHOT-jar-with-dependencies.jar $HBASE_HOME/lib/
$ gedit $HBASE_HOME/conf/hbase-site.xml
#### make sure the hbase.coprocessor.user.region.classes property stays commented out:
# <property>
#     <name>hbase.coprocessor.user.region.classes</name>
#     <value>BirthplaceToStu</value>
# </property>
$ rm -rf $HBASE_HOME/logs/
$ start-hbase.sh
$ hbase shell
> list
> create 'student',{NAME=>'basic_info'},{NAME=>'more_info'}
> disable 'student'
> alter 'student', METHOD => 'table_att', 'coprocessor' => '|BirthplaceToStu||'
> enable 'student'
> describe 'student'
> create 'birthplace_stu_index',{NAME=>'birthplace_stu_info'},{NAME=>'more_info'}
> put 'student','stu-100001','basic_info:birthplace','hubei'
> put 'student','stu-100001','basic_info:fractional','555'
> put 'student','stu-100002','basic_info:birthplace','jiangxi'
> put 'student','stu-100003','basic_info:fractional','556'
> scan 'student'
> scan 'birthplace_stu_index'
> disable 'student'
> alter 'student', METHOD => 'table_att', 'coprocessor' => '|com.thinking.grpc.impl.FractionalInfoEndpoint||'
> enable 'student'
> describe 'student'
*/
Some older references implement the endpoint as FractionalInfoEndpoint implements Coprocessor, CoprocessorService. On HBase 2.1.5 an endpoint loaded that way never has its public void start(CoprocessorEnvironment env) throws IOException method invoked, which leaves the RegionCoprocessorEnvironment environment field null.
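In other words, on HBase 2.x the endpoint has to opt in through the RegionCoprocessor interface, as the class above does. The decisive lines, excerpted from FractionalInfoEndpoint (not a standalone class):

public class FractionalInfoEndpoint extends StuFractionalProto.FractionalInfoService implements RegionCoprocessor {
    @Override
    public Iterable<Service> getServices() {
        // This is what makes the RegionServer register the RPC service
        // and invoke start()/stop() on the coprocessor.
        return Collections.singleton((Service) this);
    }
    // ...
}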
Client Side
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.thinking</groupId>
    <artifactId>hello-hbase-coprocessor-client</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.google.protobuf/protobuf-java -->
        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java</artifactId>
            <version>3.9.0</version>
        </dependency>
    </dependencies>
</project>
Main.java
import com.thinking.grpc.proto.StuFractionalProto;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.Map;
public class Main {
    /*
     * Put the cluster config on the client classpath first:
     * $ cd src/main/resources
     * $ mkdir hbase
     * $ cp -r $HBASE_HOME/conf/hbase-site.xml hbase/
     */
    public static void main(String[] args) throws Throwable {
        Configuration configuration = HBaseConfiguration.create();
        Connection connection = ConnectionFactory.createConnection(configuration);
        System.out.println("conn success!");
        TableName tableName = TableName.valueOf("student");
        Table table = connection.getTable(tableName);
        // Insert one student; the observer indexes him under 'hubei'.
        Put putBen = new Put(Bytes.toBytes("stu_001"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("name"), Bytes.toBytes("ben"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("gen"), Bytes.toBytes("M"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("age"), Bytes.toBytes("22"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("birthplace"), Bytes.toBytes("hubei"));
        putBen.addColumn(Bytes.toBytes("basic_info"), Bytes.toBytes("fractional"), Bytes.toBytes("555"));
        table.put(putBen);
        // Ask the endpoint for the maximum score in hubei; null start/end keys
        // mean the call is sent to every region of the table.
        StuFractionalProto.GetFractionalInfoReq req = StuFractionalProto.GetFractionalInfoReq.newBuilder().setPlace("hubei").setType("MAX").build();
        Map<byte[], Float> result = table.coprocessorService(StuFractionalProto.FractionalInfoService.class, null, null, new Batch.Call<StuFractionalProto.FractionalInfoService, Float>() {
            @Override
            public Float call(StuFractionalProto.FractionalInfoService instance) throws IOException {
                CoprocessorRpcUtils.BlockingRpcCallback<StuFractionalProto.GetFractionalInfoResp> callback = new CoprocessorRpcUtils.BlockingRpcCallback<>();
                instance.getFractionalInfo(null, req, callback);
                return callback.get().getFractional();
            }
        });
        // The returned map is keyed by region name.
        for (byte[] bytes : result.keySet()) {
            System.out.println(Bytes.toString(bytes) + "-->" + result.get(bytes));
        }
        table.close();
        connection.close();
    }
}
Client output
conn success!
student,,1563956471472.eaaac0653cb76856c8fe3f75038cdccb.-->555.0
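Note that coprocessorService(..., null, null, ...) invokes the endpoint once per region of 'student', and the returned Map is keyed by region name, which is why the region name appears in the output above. As written, getFractionalInfo computes the global answer from the index table on every invocation, so with more than one region the client would simply receive duplicate entries, one per region. An endpoint that returned per-region partial results would instead need a client-side merge; for MAX the merge is just the maximum of the returned values, e.g. (a sketch against the result map above):

// Merge per-region MAX partials into a global maximum.
float globalMax = Float.NEGATIVE_INFINITY;
for (Float regionMax : result.values()) {
    globalMax = Math.max(globalMax, regionMax);
}
System.out.println("global max = " + globalMax);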
Q & A
Q: Much of what coprocessors, and endpoints in particular, can do could also be done with Spark. What are the strengths and weaknesses of each, and when should which be used?
A: A coprocessor runs inside the RegionServer's process space. To keep the region responsive to regular read/write traffic, a coprocessor should not run large traversal jobs; full-table scans, joins and the like are better left to MapReduce or Spark.