hbase 自带的AggregationClient只能对单一列族的单一列进行聚合。如果想对多个列进行聚合的话,比如后面例子中说的salecount(销售量)和salemoney(销售金额),用AggregationClient只能调用两次,这样难免效率会比较低,而且两次调用一致性也不能保证(可能你sum完salecount后,再sum salemoney之前又插入了数据)。
所以只能实现一个自定义的endpoint coprocessor了。
首先自定义一个实现Writable的类MyMutiSum,因为要在hadoop集群中进行传输,所以必须实现Writable接口。用来返回每个列sum后的结果,该类实现如下:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.Writable;
/**
 * Writable result holder for a multi-column sum: one running total per
 * requested column, kept in request order. Implements {@link Writable} so it
 * can be serialized across the coprocessor RPC boundary.
 */
public class MyMutiSum implements Writable {

    // Per-column sums; index i corresponds to the i-th column the caller requested.
    private List<Long> resultList = new ArrayList<Long>();

    /** No-arg constructor required by the Writable deserialization contract. */
    public MyMutiSum() {
    }

    /**
     * Creates a result holder with {@code resultSize} slots, all initialized to 0.
     *
     * @param resultSize number of columns being summed
     */
    public MyMutiSum(int resultSize) {
        for (int i = 0; i < resultSize; i++) {
            resultList.add(0L);
        }
    }

    /**
     * @param i column index (same order as the request)
     * @return the accumulated sum for column {@code i}
     */
    public Long getSum(int i) {
        return resultList.get(i);
    }

    /**
     * Overwrites the sum stored for column {@code i}.
     *
     * @param i   column index
     * @param sum new accumulated value
     */
    public void setSum(int i, Long sum) {
        resultList.set(i, sum);
    }

    /** @return how many column sums this object holds */
    public int getResultSize() {
        return resultList.size();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Length-prefixed encoding so readFields knows how many longs to read.
        out.writeInt(resultList.size());
        for (Long v : resultList) {
            out.writeLong(v);
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // BUG FIX: Hadoop may reuse Writable instances across calls; without
        // clearing, freshly read values would be appended after stale ones.
        resultList.clear();
        int size = in.readInt();
        for (int i = 0; i < size; i++) {
            resultList.add(in.readLong());
        }
    }
}
然后自定义一个RPC协议,里面有个方法的参数columns是将你要进行sum的多个列都传过去作为参数:
import java.io.IOException;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.ipc.CoprocessorProtocol;
/**
 * Coprocessor RPC protocol that sums several columns in a single server-side
 * call, avoiding one round trip (and one consistency window) per column.
 * Deployed as an endpoint coprocessor; invoked per region via
 * HTable.coprocessorExec.
 */
public interface MyCoprocessorProtocol extends CoprocessorProtocol {
/**
 * Sums every listed column qualifier over the rows selected by {@code scan}
 * within the region this instance is attached to.
 *
 * @param columns qualifiers to sum; result slots follow this order
 * @param scan    row/column selection to apply on the region
 * @return per-column sums for this region only (client merges regions)
 * @throws IOException on scanner failure
 */
MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException;
}
然后实现这个RPC协议的服务(方法):
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.BaseEndpointCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.InternalScanner;
import org.apache.hadoop.hbase.util.Bytes;
/**
 * Server-side endpoint implementation of {@link MyCoprocessorProtocol}.
 * Runs inside each region server and sums the requested columns over the
 * rows of its region in a single scan pass.
 */
public class MyEndpointImpl extends BaseEndpointCoprocessor implements
        MyCoprocessorProtocol {

    protected static Log log = LogFactory.getLog(MyEndpointImpl.class);

    /**
     * Sums the given column qualifiers over every row of this region matched
     * by {@code scan}. Cell values are decoded as 8-byte longs.
     *
     * @param columns qualifiers to sum; result slot i holds the sum of columns[i]
     * @param scan    selection to scan (caller is expected to have restricted
     *                it to the relevant family/qualifiers)
     * @return per-column partial sums for this region
     * @throws IOException if the region scanner fails
     */
    @Override
    public MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException {
        MyMutiSum result = new MyMutiSum(columns.length);
        InternalScanner scanner = ((RegionCoprocessorEnvironment) getEnvironment())
                .getRegion().getScanner(scan);
        List<KeyValue> keyValues = new ArrayList<KeyValue>();
        try {
            boolean hasMoreRows;
            do {
                // next() fills keyValues with one row's cells; returns false
                // when the region is exhausted.
                hasMoreRows = scanner.next(keyValues);
                for (int i = 0; i < columns.length; i++) {
                    String column = columns[i];
                    for (KeyValue kv : keyValues) {
                        // NOTE(review): only the qualifier is compared, not the
                        // family — assumes the Scan already restricts the family;
                        // confirm against callers.
                        if (column.equals(Bytes.toString(kv.getQualifier()))) {
                            byte[] value = kv.getValue();
                            // Positive check replaces the original empty
                            // if-branch: skip null/empty cells, sum the rest.
                            if (value != null && value.length > 0) {
                                result.setSum(i, result.getSum(i) + Bytes.toLong(value));
                            }
                            break; // at most one cell per qualifier per row
                        }
                    }
                }
                keyValues.clear();
            } while (hasMoreRows);
        } finally {
            scanner.close();
        }
        // Guard the debug output so the string concatenation is not paid
        // when debug logging is disabled.
        if (log.isDebugEnabled()) {
            log.debug("Sum from this region is "
                    + ((RegionCoprocessorEnvironment) getEnvironment()).getRegion()
                            .getRegionNameAsString() + ": ");
            for (int i = 0; i < columns.length; i++) {
                log.debug(columns[i] + " " + result.getSum(i));
            }
        }
        // Return this region's partial sums; the client merges all regions.
        return result;
    }
}
接下来我们可以实现一个rpc client:
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.util.Bytes;
/**
 * Client-side helper that fans a multi-column sum out to every region via the
 * {@link MyCoprocessorProtocol} endpoint and merges the per-region results.
 */
public class MyEndpointClient {

    protected static Log log = LogFactory.getLog(MyEndpointClient.class);

    private Configuration conf;

    public MyEndpointClient(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Sums each of {@code columns} (in family {@code cf}) across the table.
     *
     * @param tableName table to aggregate over
     * @param cf        column family holding the columns
     * @param columns   qualifiers to sum; result slot i matches columns[i]
     * @param scan      row range / selection; start and stop rows decide which
     *                  regions receive an RPC
     * @return merged sums over all regions, or null if no region responded
     * @throws Throwable propagated from the coprocessor execution
     */
    public MyMutiSum mutiSum(String tableName, String cf,
            final String[] columns, final Scan scan) throws Throwable {

        // Folds each region's partial result into a single running total.
        class MutiSumCallBack implements Batch.Callback<MyMutiSum> {
            MyMutiSum sumVal = null;

            public MyMutiSum getSumResult() {
                return sumVal;
            }

            @Override
            public void update(byte[] region, byte[] row, MyMutiSum result) {
                // Invoked once per region RPC response.
                sumVal = add(sumVal, result);
            }

            public MyMutiSum add(MyMutiSum left, MyMutiSum right) {
                if (left == null ^ right == null) {
                    // Exactly one side present: take it as-is.
                    return left == null ? right : left;
                }
                if (left == null) {
                    // Both sides null.
                    return null;
                }
                MyMutiSum merged = new MyMutiSum(columns.length);
                for (int i = 0; i < columns.length; i++) {
                    merged.setSum(i, left.getSum(i) + right.getSum(i));
                }
                return merged;
            }
        }

        MutiSumCallBack sumCallBack = new MutiSumCallBack();
        // Restrict the scan to exactly the columns being summed.
        for (String column : columns) {
            scan.addColumn(Bytes.toBytes(cf), Bytes.toBytes(column));
        }
        HTable table = null;
        try {
            table = new HTable(conf, tableName);
            // coprocessorExec issues one RPC per region intersecting
            // [startRow, stopRow]; each response flows through
            // sumCallBack.update(), which accumulates into sumVal.
            table.coprocessorExec(MyCoprocessorProtocol.class,
                    scan.getStartRow(), scan.getStopRow(),
                    new Batch.Call<MyCoprocessorProtocol, MyMutiSum>() {
                        @Override
                        public MyMutiSum call(MyCoprocessorProtocol instance)
                                throws IOException {
                            // Becomes the server-side getMutiSum RPC on each
                            // region hosting MyCoprocessorProtocol.
                            return instance.getMutiSum(columns, scan);
                        }
                    }, sumCallBack);
        } finally {
            if (table != null) {
                table.close();
            }
        }
        // Grand total accumulated across all region responses.
        return sumCallBack.getSumResult();
    }
}
ok,现在我们将这三个类打包成MutiSum.jar,上传到hdfs上去,我这里传的目录为hdfs://master24:9000/user/hadoop/jars/MutiSum.jar。
接下来我们将这个自定义的cp设置到member4这个表上去,通过API来实现:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;
/**
 * One-shot admin utility that attaches the MyEndpointImpl endpoint
 * coprocessor (packaged in MutiSum.jar on HDFS) to the "member4" table.
 */
public class SetCoprocessor {

    /**
     * Disables the table, adds the coprocessor to its descriptor, applies the
     * modified descriptor, then re-enables the table.
     *
     * @param args unused
     * @throws Exception on any HBase admin failure
     */
    public static void main(String[] args) throws Exception {
        byte[] tableName = Bytes.toBytes("member4");
        Configuration conf = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(conf);
        try {
            // The table must be disabled before its descriptor can change.
            admin.disableTable(tableName);
            HTableDescriptor htd = admin.getTableDescriptor(tableName);
            // Priority 1001; region servers load the jar from HDFS.
            htd.addCoprocessor("com.besttone.coprocessor.MyEndpointImpl", new Path(
                    "hdfs://master24:9000/user/hadoop/jars/MutiSum.jar"), 1001,
                    null);
            admin.modifyTable(tableName, htd);
            admin.enableTable(tableName);
        } finally {
            // BUG FIX: the admin connection was leaked when any call above
            // threw; always release it.
            admin.close();
        }
    }
}
所有都准备就绪了,接下来就可以写一个main函数来测试调用一下这个cp了:
/**
* @param args
* @throws Throwable
*/
/**
 * Demo driver: sums the "salecount" and "salemoney" columns (family "info")
 * of table "member4" through the custom endpoint coprocessor and prints the
 * totals.
 *
 * @param args unused
 * @throws Throwable propagated from the coprocessor call
 */
public static void main(String[] args) throws Throwable {
    // Columns to aggregate; result slot i matches columns[i].
    final String[] columns = new String[] { "salecount", "salemoney" };
    Configuration conf = HBaseConfiguration.create();
    final Scan scan = new Scan();
    byte[] family = Bytes.toBytes("info");
    for (String column : columns) {
        scan.addColumn(family, Bytes.toBytes(column));
    }
    MyEndpointClient client = new MyEndpointClient(conf);
    MyMutiSum mutiSum = client.mutiSum("member4", "info", columns, scan);
    for (int i = 0; i < columns.length; i++) {
        System.out.println(columns[i] + " sum is :" + mutiSum.getSum(i));
    }
}
针对以下代码补充说明一下:
table.coprocessorExec(MyCoprocessorProtocol.class,
scan.getStartRow(), scan.getStopRow(),
new Batch.Call<MyCoprocessorProtocol, MyMutiSum>() {
@Override
public MyMutiSum call(MyCoprocessorProtocol instance)
throws IOException {
// TODO Auto-generated method stub
// instance.getMutiSum会转化成对region上的指定的MyCoprocessorProtocol的实现类的该方法的rpc调用
return instance.getMutiSum(columns, scan);
}
}, sumCallBack);
hbase 权威指南上的例子是在call方法内部调用多个instance的方法,然后return 一个Pair,这种也是可行的,不过还是尽量封装成调用instance 的一个方法发起一个RPC,调用多个方法其实发起的RPC调用更多。