转载地址:http://blog.csdn.net/xiao_jun_0820/article/details/27092831
hbase 自带的AggregationClient只能对单一列族的单一列进行聚合。如果想对多个列进行聚合的话,比如后面例子中说的salecount(销售量)和salemoney(销售金额),用AggregationClient只能调用两次,这样难免效率会比较低,而且两次调用的一致性也不能保证(可能你sum完salecount后,再sum salemoney之前又插入了新的数据)。
所以只能实现一个自定义的endpoint coprocessor了。
首先自定义一个实现Writable接口的类MyMutiSum,因为要在hadoop集群中进行传输,所以必须实现Writable。该类用来返回每个列sum后的结果,实现如下:
- import java.io.DataInput;
- import java.io.DataOutput;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.hadoop.io.Writable;
- public class MyMutiSum implements Writable {
- private List<Long> resultList = new ArrayList<Long>();
- public MyMutiSum() {
- }
- public MyMutiSum(int resultSize) {
- for (int i = 0; i < resultSize; i++) {
- resultList.add(0L);
- }
- }
- public Long getSum(int i) {
- return resultList.get(i);
- }
- public void setSum(int i, Long sum) {
- resultList.set(i, sum);
- }
- public int getResultSize() {
- return resultList.size();
- }
- @Override
- public void write(DataOutput out) throws IOException {
- // TODO Auto-generated method stub
- out.writeInt(resultList.size());
- for (Long v : resultList) {
- out.writeLong(v);
- }
- }
- @Override
- public void readFields(DataInput in) throws IOException {
- // TODO Auto-generated method stub
- int size = in.readInt();
- for (int i = 0; i < size; i++) {
- resultList.add(in.readLong());
- }
- }
- }
- import java.io.IOException;
- import org.apache.hadoop.hbase.client.Scan;
- import org.apache.hadoop.hbase.ipc.CoprocessorProtocol;
/**
 * Client-visible RPC contract of the multi-column sum endpoint.  Every region
 * hosting the table exposes this protocol; the client invokes it per region
 * and merges the partial results.
 *
 * NOTE(review): extends the pre-0.96 {@code CoprocessorProtocol} endpoint API,
 * which was removed in later HBase versions in favor of protobuf services.
 */
public interface MyCoprocessorProtocol extends CoprocessorProtocol {

    /**
     * Sums the given column qualifiers over the rows of one region selected
     * by {@code scan}; result slot i corresponds to {@code columns[i]}.
     */
    MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException;
}
然后实现这个RPC协议的服务(方法):
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.hbase.KeyValue;
- import org.apache.hadoop.hbase.client.Scan;
- import org.apache.hadoop.hbase.coprocessor.BaseEndpointCoprocessor;
- import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
- import org.apache.hadoop.hbase.regionserver.InternalScanner;
- import org.apache.hadoop.hbase.util.Bytes;
/**
 * Server-side implementation of {@link MyCoprocessorProtocol}.  Loaded on
 * every region of the table; each region sums the requested columns over the
 * rows it hosts in a single scan and returns its partial result.
 *
 * NOTE(review): written against the pre-0.96 endpoint API
 * (BaseEndpointCoprocessor), removed in later HBase releases.
 */
public class MyEndpointImpl extends BaseEndpointCoprocessor implements
        MyCoprocessorProtocol {

    protected static Log log = LogFactory.getLog(MyEndpointImpl.class);

    /**
     * Sums the given qualifiers over all rows of this region matching the
     * scan, in one pass.
     *
     * @param columns qualifier names to aggregate; result slot i corresponds
     *                to columns[i]
     * @param scan    restricts the rows/columns read from the region
     * @return per-column sums for this region only (the client merges regions)
     * @throws IOException if the region scan fails
     */
    @Override
    public MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException {
        MyMutiSum result = new MyMutiSum(columns.length);
        InternalScanner scanner = ((RegionCoprocessorEnvironment) getEnvironment())
                .getRegion().getScanner(scan);
        List<KeyValue> keyValues = new ArrayList<KeyValue>();
        try {
            boolean hasMoreRows = false;
            do {
                // Fetch the next row; its cells are appended to keyValues.
                hasMoreRows = scanner.next(keyValues);
                // For each column to sum, look for its cell in this row.
                for (int i = 0; i < columns.length; i++) {
                    String column = columns[i];
                    for (KeyValue kv : keyValues) {
                        if (column.equals(Bytes.toString(kv.getQualifier()))) {
                            byte[] value = kv.getValue();
                            if (value == null || value.length == 0) {
                                // Empty cell contributes nothing to the sum.
                            } else {
                                // Accumulate onto the running sum for column i.
                                // NOTE(review): assumes the cell was written
                                // with Bytes.toBytes(long) — TODO confirm.
                                Long tValue = Bytes.toLong(value);
                                result.setSum(i, result.getSum(i) + tValue);
                            }
                            // Column found in this row; stop scanning cells.
                            break;
                        }
                    }
                }
                // Reuse the cell buffer for the next row.
                keyValues.clear();
            } while (hasMoreRows);
        } finally {
            scanner.close();
        }
        log.debug("Sum from this region is "
                + ((RegionCoprocessorEnvironment) getEnvironment()).getRegion()
                        .getRegionNameAsString() + ": ");
        for (int i = 0; i < columns.length; i++) {
            log.debug(columns[i] + " " + result.getSum(i));
        }
        // Return this region's partial sums wrapped in the custom Writable.
        return result;
    }
}
接下来我们可以实现一个rpc client:
- import java.io.IOException;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.hbase.HBaseConfiguration;
- import org.apache.hadoop.hbase.client.HTable;
- import org.apache.hadoop.hbase.client.Scan;
- import org.apache.hadoop.hbase.client.coprocessor.Batch;
- import org.apache.hadoop.hbase.util.Bytes;
/**
 * Client-side helper that invokes the {@link MyCoprocessorProtocol} endpoint
 * on every region covered by a scan and merges the per-region partial sums
 * into one final result.
 */
public class MyEndpointClient {

    protected static Log log = LogFactory.getLog(MyEndpointClient.class);

    private Configuration conf;

    public MyEndpointClient(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Sums the given columns of family {@code cf} over the row range of
     * {@code scan}, issuing one coprocessor RPC per region.
     *
     * @param tableName table carrying the endpoint coprocessor
     * @param cf        column family of the columns to aggregate
     * @param columns   qualifier names; result slot i corresponds to columns[i]
     * @param scan      row range; its column set is extended to cf:columns
     * @return merged sums across all regions, or null if no region responded
     * @throws Throwable propagated from the coprocessor framework
     */
    public MyMutiSum mutiSum(String tableName, String cf,
            final String[] columns, final Scan scan) throws Throwable {

        // Collects and merges the partial result of each region RPC.
        class MutiSumCallBack implements Batch.Callback<MyMutiSum> {

            // Running total; stays null until the first region reports.
            MyMutiSum sumVal = null;

            public MyMutiSum getSumResult() {
                return sumVal;
            }

            // Invoked once per region with that region's partial result.
            @Override
            public void update(byte[] region, byte[] row, MyMutiSum result) {
                sumVal = add(sumVal, result);
            }

            // Element-wise sum of two partial results; tolerates nulls.
            public MyMutiSum add(MyMutiSum l1, MyMutiSum l2) {
                if (l1 == null ^ l2 == null) {
                    return (l1 == null) ? l2 : l1; // either of one is null.
                } else if (l1 == null) // both are null
                    return null;
                MyMutiSum mutiSum = new MyMutiSum(columns.length);
                for (int i = 0; i < columns.length; i++) {
                    mutiSum.setSum(i, l1.getSum(i) + l2.getSum(i));
                }
                return mutiSum;
            }
        }

        MutiSumCallBack sumCallBack = new MutiSumCallBack();
        HTable table = null;
        // Ensure the scan actually selects every column we aggregate.
        for (int i = 0; i < columns.length; i++) {
            scan.addColumn(Bytes.toBytes(cf), Bytes.toBytes(columns[i]));
        }
        try {
            table = new HTable(conf, tableName);
            // coprocessorExec resolves the regions overlapping
            // [startRow, stopRow] and issues one RPC per region; after each
            // RPC returns, sumCallBack.update() folds that region's partial
            // result into the running total.
            table.coprocessorExec(MyCoprocessorProtocol.class,
                    scan.getStartRow(), scan.getStopRow(),
                    new Batch.Call<MyCoprocessorProtocol, MyMutiSum>() {
                        @Override
                        public MyMutiSum call(MyCoprocessorProtocol instance)
                                throws IOException {
                            // Proxied: this becomes an RPC to the region's
                            // registered MyCoprocessorProtocol implementation.
                            return instance.getMutiSum(columns, scan);
                        }
                    }, sumCallBack);
        } finally {
            if (table != null) {
                table.close();
            }
        }
        // The accumulated total over all regions (via update()).
        return sumCallBack.getSumResult();
    }
}
ok,现在我们将这三个类打包成MutiSum.jar,上传到hdfs上去,我这里传的目录为hdfs://master24:9000/user/hadoop/jars/MutiSum.jar。
接下来我们将这个自定义的cp设置到member4这个表上去,通过API来实现:
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.hbase.HBaseConfiguration;
- import org.apache.hadoop.hbase.HTableDescriptor;
- import org.apache.hadoop.hbase.MasterNotRunningException;
- import org.apache.hadoop.hbase.client.HBaseAdmin;
- import org.apache.hadoop.hbase.util.Bytes;
- public class SetCoprocessor {
- /**
- * @param args
- * @throws Exception
- * @throws MasterNotRunningException
- */
- public static void main(String[] args) throws MasterNotRunningException,
- Exception {
- // TODO Auto-generated method stub
- byte[] tableName = Bytes.toBytes("member4");
- Configuration conf = HBaseConfiguration.create();
- HBaseAdmin admin = new HBaseAdmin(conf);
- admin.disableTable(tableName);
- HTableDescriptor htd = admin.getTableDescriptor(tableName);
- htd.addCoprocessor("com.besttone.coprocessor.MyEndpointImpl", new Path(
- "hdfs://master24:9000/user/hadoop/jars/MutiSum.jar"), 1001,
- null);
- admin.modifyTable(tableName, htd);
- admin.enableTable(tableName);
- admin.close();
- }
- }
所有都准备就绪了,接下来就可以写一个main函数来测试调用一下这个cp了:
/**
 * Demo entry point: sums the "salecount" and "salemoney" columns of the
 * "info" family in table "member4" via the custom endpoint coprocessor.
 *
 * @param args unused
 * @throws Throwable propagated from the coprocessor client
 */
public static void main(String[] args) throws Throwable {
    // The test table member4 has two columns in family "info": salecount
    // (units sold) and salemoney (revenue).  The custom coprocessor returns
    // both totals in a single consistent pass.
    final String[] columns = new String[] { "salecount", "salemoney" };
    Configuration conf = HBaseConfiguration.create();
    final Scan scan;
    scan = new Scan();
    scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salecount"));
    scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salemoney"));
    MyEndpointClient client = new MyEndpointClient(conf);
    MyMutiSum mutiSum = client.mutiSum("member4", "info", columns, scan);
    for (int i = 0; i < columns.length; i++) {
        System.out.println(columns[i] + " sum is :" + mutiSum.getSum(i));
    }
}
针对以下代码补充说明一下:
// Excerpt (repeated from MyEndpointClient.mutiSum) for the discussion below:
table.coprocessorExec(MyCoprocessorProtocol.class,
        scan.getStartRow(), scan.getStopRow(),
        new Batch.Call<MyCoprocessorProtocol, MyMutiSum>() {
            @Override
            public MyMutiSum call(MyCoprocessorProtocol instance)
                    throws IOException {
                // Proxied: each invocation becomes an RPC to the region's
                // registered MyCoprocessorProtocol implementation.
                return instance.getMutiSum(columns, scan);
            }
        }, sumCallBack);
hbase 权威指南上的例子是在call方法内部调用多个instance的方法,然后return 一个Pair,这种也是可行的,不过还是尽量封装成调用instance 的一个方法发起一个RPC,调用多个方法其实发起的RPC调用更多。