不知道为什么,hbase0.96中只定义了一个AggregateService的RPC协议,并没有具体的实现类和客户端调用类,官网新的0.98版本里又出现了这两个类,于是把0.98里的这两个类搬来了,在0.96上测试通过可以使用。
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.NavigableSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.Coprocessor;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.ColumnInterpreter;
import org.apache.hadoop.hbase.coprocessor.CoprocessorException;
import org.apache.hadoop.hbase.coprocessor.CoprocessorService;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateRequest;
import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateResponse;
import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateService;
import org.apache.hadoop.hbase.regionserver.InternalScanner;
import com.google.protobuf.ByteString;
import com.google.protobuf.Message;
import com.google.protobuf.RpcCallback;
import com.google.protobuf.RpcController;
import com.google.protobuf.Service;
/**
* A concrete AggregateProtocol implementation. Its system level coprocessor
* that computes the aggregate function at a region level.
* {@link ColumnInterpreter} is used to interpret column value. This class is
* parameterized with the following (these are the types with which the {@link ColumnInterpreter}
* is parameterized, and for more description on these, refer to {@link ColumnInterpreter}):
* @param <T> Cell value data type
* @param <S> Promoted data type
* @param <P> PB message that is used to transport initializer specific bytes
* @param <Q> PB message that is used to transport Cell (<T>) instance
* @param <R> PB message that is used to transport Promoted (<S>) instance
*/
@InterfaceAudience.Private
public class AggregateImplementation<T, S, P extends Message, Q extends Message, R extends Message>
extends AggregateService implements CoprocessorService, Coprocessor {
protected static final Log log = LogFactory.getLog(AggregateImplementation.class);
private RegionCoprocessorEnvironment env;
/**
* Gives the maximum for a given combination of column qualifier and column
* family, in the given row range as defined in the Scan object. In its
* current implementation, it takes one column family and one column qualifier
* (if provided). In case of null column qualifier, maximum value for the
* entire column family will be returned.
*/
@Override
public void getMax(RpcController controller, AggregateRequest request,
RpcCallback<AggregateResponse> done) {
InternalScanner scanner = null;
AggregateResponse response = null;
T max = null;
try {
ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);
T temp;
Scan scan = ProtobufUtil.toScan(request.getScan());
scanner = env.getRegion().getScanner(scan);
List<Cell> results = new ArrayList<Cell>();
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);
byte[] qualifier = null;
if (qualifiers != null && !qualifiers.isEmpty()) {
qualifier = qualifiers.pollFirst();
}
// qualifier can be null.
boolean hasMoreRows = false;
do {
hasMoreRows = scanner.next(results);
for (Cell kv : results) {
temp = ci.getValue(colFamily, qualifier, kv);
max = (max == null || (temp != null && ci.compare(temp, max) > 0)) ? temp : max;
}
results.clear();
} while (hasMoreRows);
if (max != null) {
AggregateResponse.Builder builder = AggregateResponse.newBuilder();
builder.addFirstPart(ci.getProtoForCellType(max).toByteString());
response = builder.build();
}
} catch (IOException e) {
ResponseConverter.setControllerException(controller, e);
} finally {
if (scanner != null) {
try {
scanner.close();
} catch (IOException ignored) {}
}
}
log.info("Maximum from this region is "
+ env.getRegion().getRegionNameAsString() + ": " + max);
done.run(response);
}
/**
* Gives the minimum for a given combination of column qualifier and column
* family, in the given row range as defined in the Scan object. In its
* current implementation, it takes one column family and one column qualifier
* (if provided). In case of null column qualifier, minimum value for the
* entire column family will be returned.
*/
@Override
public void getMin(RpcController controller, AggregateRequest request,
RpcCallback<AggregateResponse> done) {
AggregateResponse response = null;
InternalScanner scanner = null;
T min = null;
try {
ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);
T temp;
Scan scan = ProtobufUtil.toScan(request.getScan());
scanner = env.getRegion().getScanner(scan);
List<Cell> results = new ArrayList<Cell>();
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);
byte[] qualifier = null;
if (qualifiers != null && !qualifiers.isEmpty()) {
qualifier = qualifiers.pollFirst();
}
boolean hasMoreRows = false;
do {
hasMoreRows = scanner.next(results);
for (Cell kv : results) {
temp = ci.getValue(colFamily, qualifier, kv);
min = (min == null || (temp != null && ci.compare(temp, min) < 0)) ? temp : min;
}
results.clear();
} while (hasMoreRows);
if (min != null) {
response = AggregateResponse.newBuilder().addFirstPart(
ci.getProtoForCellType(min).toByteString()).build();
}
} catch (IOException e) {
ResponseConverter.setControllerException(controller, e);
} finally {
if (scanner != null) {
try {
scanner.close();
} catch (IOException ignored) {}
}
}
log.info("Minimum from this region is "
+ env.getRegion().getRegionNameAsString() + ": " + min);
done.run(response);
}
/**
* Gives the sum for a given combination of column qualifier and column
* family, in the given row range as defined in the Scan object. In its
* current implementation, it takes one column family and one column qualifier
* (if provided). In case of null column qualifier, sum for the entire column
* family will be returned.
*/
@Override
public void getSum(RpcController controller, AggregateRequest request,
RpcCallback<AggregateResponse> done) {
AggregateResponse response = null;
InternalScanner scanner = null;
long sum = 0l;
try {
ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);
S sumVal = null;
T temp;
Scan scan = ProtobufUtil.toScan(request.getScan());
scanner = env.getRegion().getScanner(scan);
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);
byte[] qualifier = null;
if (qualifiers != null && !qualifiers.isEmpty()) {
qualifier = qualifiers.pollFirst();
}
List<Cell> results = new ArrayList<Cell>();
boolean hasMoreRows = false;
do {
hasMoreRows = scanner.next(results);
for (Cell kv : results) {
temp = ci.getValue(colFamily, qualifier, kv);
if (temp != null)
sumVal = ci.add(sumVal, ci.castToReturnType(temp));
}
results.clear();
} while (hasMoreRows);
if (sumVal != null) {
response = AggregateResponse.newBuilder().addFirstPart(
ci.getProtoForPromotedType(sumVal).toByteString()).build();
}
} catch (IOException e) {
ResponseConverter.setControllerException(controller, e);
} finally {
if (scanner != null) {
try {
scanner.close();
} catch (IOException ignored) {}
}
}
log.debug("Sum from this region is "
+ env.getRegion().getRegionNameAsString() + ": " + sum);
done.run(response);
}
/**
* Gives the row count for the given column family and column qualifier, in
* the given row range as defined in the Scan object.
* @throws IOException
*/
@Override
public void getRowNum(RpcController controller, AggregateRequest request,
RpcCallback<AggregateResponse> done) {
AggregateResponse response = null;
long counter = 0l;
List<Cell> results = new ArrayList<Cell>();
InternalScanner scanner = null;
try {
Scan scan = ProtobufUtil.toScan(request.getScan());
byte[][] colFamilies = scan.getFamilies();
byte[] colFamily = colFamilies != null ? colFamilies[0] : null;
NavigableSet<byte[]> qualifiers = colFamilies != null ?
scan.getFamilyMap().get(colFamily) : null;
byte[] qualifier = null;
if (qualifiers != null && !qualifiers.isEmpty()) {
qualifier = qualifiers.pollFirst();
}
if (scan.getFilter() == null && qualifier == null)
scan.setFilter(new FirstKeyOnlyFilter());
scanner = env.getRegion().getScanner(scan);
boolean hasMoreRows = false;
do {
hasMoreRows = scanner.next(results);
if (results.size() > 0) {
counter++;
}
results.clear();
} while (hasMoreRows);
ByteBuffer bb = ByteBuffer.allocate(8).putLong(counter);
bb.rewind();
response = AggregateResponse.newBuilder().addFirstPart(
ByteString.copyFrom(bb)).build();
} catch (IOException e) {
ResponseConverter.setControllerException(controller, e);
} finally {
if (scanner != null) {
try {
scanner.close();
} catch (IOException ignored) {}
}
}
log.info("Row counter from this region is "
+ env.getRegion().getRegionNameAsString() + ": " + counter);
done.run(response);
}
/**
* Gives a Pair with first object as Sum and second object as row count,
* computed for a given combination of column qualifier and column family in
* the given row range as defined in the Scan object. In its current
* implementation, it takes one column family and one column qualifier (if
* provided). In case of null column qualifier, an aggregate sum over all the
* entire column family will be returned.
* <p>
* The average is computed in
* AggregationClient#avg(byte[], ColumnInterpreter, Scan) by
* processing results from all regions, so its "ok" to pass sum and a Long
* type.
*/
@Override
public void getAvg(RpcController controller, AggregateRequest request,
RpcCallback<AggregateResponse> done) {
AggregateResponse response = null;
InternalScanner scanner = null;
try {
ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);
S sumVal = null;
Long rowCountVal = 0l;
Scan scan = ProtobufUtil.toScan(request.getScan());
scanner = env.getRegion().getScanner(scan);
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);
byte[] qualifier = null;
if (qualifiers != null && !qualifiers.isEmpty()) {
qualifier = qualifiers.pollFirst();
}
List<Cell> results = new ArrayList<Cell>();
boolean hasMoreRows = false;
do {
results.clear();
hasMoreRows = scanner.next(results);
for (Cell kv : results) {
sumVal = ci.add(sumVal, ci.castToReturnType(ci.getValue(colFamily,
qualifier, kv)));
}
rowCountVal++;
} while (hasMoreRows);
if (sumVal != null) {
ByteString first = ci.getProtoForPromotedType(sumVal).toByteString();
AggregateResponse.Builder pair = AggregateResponse.newBuilder();
pair.addFirstPart(first);
ByteBuffer bb = ByteBuffer.allocate(8).putLong(rowCountVal);
bb.rewind();
pair.setSecondPart(ByteString.copyFrom(bb));
response = pair.build();
}
} catch (IOException e) {
ResponseConverter.setControllerException(controller, e);
} finally {
if (scanner != null) {
try {
scanner.close();
} catch (IOException ignored) {}
}
}
done.run(response);
}
/**
* Gives a Pair with first object a List containing Sum and sum of squares,
* and the second object as row count. It is computed for a given combination of
* column qualifier and column family in the given row range as defined in the
* Scan object. In its current implementation, it takes one column family and
* one column qualifier (if provided). The idea is get the value of variance first:
* the average of the squares less the square of the average a standard
* deviation is square root of variance.
*/
@Override
public void getStd(RpcController controller, AggregateRequest request,
RpcCallback<AggregateResponse> done) {
InternalScanner scanner = null;
AggregateResponse response = null;
try {
ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);
S sumVal = null, sumSqVal = null, tempVal = null;
long rowCountVal = 0l;
Scan scan = ProtobufUtil.toScan(request.getScan());
scanner = env.getRegion().getScanner(scan);
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);
byte[] qualifier = null;
if (qualifiers != null && !qualifiers.isEmpty()) {
qualifier = qualifiers.pollFirst();
}
List<Cell> results = new ArrayList<Cell>();
boolean hasMoreRows = false;
do {
tempVal = null;
hasMoreRows = scanner.next(results);
for (Cell kv : results) {
tempVal = ci.add(tempVal, ci.castToReturnType(ci.getValue(colFamily,
qualifier, kv)));
}
results.clear();
sumVal = ci.add(sumVal, tempVal);
sumSqVal = ci.add(sumSqVal, ci.multiply(tempVal, tempVal));
rowCountVal++;
} while (hasMoreRows);
if (sumVal != null) {
ByteString first_sumVal = ci.getProtoForPromotedType(sumVal).toByteString();
ByteString first_sumSqVal = ci.getProtoForPromotedType(sumSqVal).toByteString();
AggregateResponse.Builder pair = AggregateResponse.newBuilder();
pair.addFirstPart(first_sumVal);
pair.addFirstPart(first_sumSqVal);
ByteBuffer bb = ByteBuffer.allocate(8).putLong(rowCountVal);
bb.rewind();
pair.setSecondPart(ByteString.copyFrom(bb));
response = pair.build();
}
} catch (IOException e) {
ResponseConverter.setControllerException(controller, e);
} finally {
if (scanner != null) {
try {
scanner.close();
} catch (IOException ignored) {}
}
}
done.run(response);
}
/**
* Gives a List containing sum of values and sum of weights.
* It is computed for the combination of column
* family and column qualifier(s) in the given row range as defined in the
* Scan object. In its current implementation, it takes one column family and
* two column qualifiers. The first qualifier is for values column and
* the second qualifier (optional) is for weight column.
*/
@Override
public void getMedian(RpcController controller, AggregateRequest request,
RpcCallback<AggregateResponse> done) {
AggregateResponse response = null;
InternalScanner scanner = null;
try {
ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);
S sumVal = null, sumWeights = null, tempVal = null, tempWeight = null;
Scan scan = ProtobufUtil.toScan(request.getScan());
scanner = env.getRegion().getScanner(scan);
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);
byte[] valQualifier = null, weightQualifier = null;
if (qualifiers != null && !qualifiers.isEmpty()) {
valQualifier = qualifiers.pollFirst();
// if weighted median is requested, get qualifier for the weight column
weightQualifier = qualifiers.pollLast();
}
List<Cell> results = new ArrayList<Cell>();
boolean hasMoreRows = false;
do {
tempVal = null;
tempWeight = null;
hasMoreRows = scanner.next(results);
for (Cell kv : results) {
tempVal = ci.add(tempVal, ci.castToReturnType(ci.getValue(colFamily,
valQualifier, kv)));
if (weightQualifier != null) {
tempWeight = ci.add(tempWeight,
ci.castToReturnType(ci.getValue(colFamily, weightQualifier, kv)));
}
}
results.clear();
sumVal = ci.add(sumVal, tempVal);
sumWeights = ci.add(sumWeights, tempWeight);
} while (hasMoreRows);
ByteString first_sumVal = ci.getProtoForPromotedType(sumVal).toByteString();
S s = sumWeights == null ? ci.castToReturnType(ci.getMinValue()) : sumWeights;
ByteString first_sumWeights = ci.getProtoForPromotedType(s).toByteString();
AggregateResponse.Builder pair = AggregateResponse.newBuilder();
pair.addFirstPart(first_sumVal);
pair.addFirstPart(first_sumWeights);
response = pair.build();
} catch (IOException e) {
ResponseConverter.setControllerException(controller, e);
} finally {
if (scanner != null) {
try {
scanner.close();
} catch (IOException ignored) {}
}
}
done.run(response);
}
@SuppressWarnings("unchecked")
ColumnInterpreter<T,S,P,Q,R> constructColumnInterpreterFromRequest(
AggregateRequest request) throws IOException {
String className = request.getInterpreterClassName();
Class<?> cls;
try {
cls = Class.forName(className);
ColumnInterpreter<T,S,P,Q,R> ci = (ColumnInterpreter<T, S, P, Q, R>) cls.newInstance();
if (request.hasInterpreterSpecificBytes()) {
ByteString b = request.getInterpreterSpecificBytes();
P initMsg = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 2, b);
ci.initialize(initMsg);
}
return ci;
} catch (ClassNotFoundException e) {
throw new IOException(e);
} catch (InstantiationException e) {
throw new IOException(e);
} catch (IllegalAccessException e) {
throw new IOException(e);
}
}
@Override
public Service getService() {
return this;
}
/**
* Stores a reference to the coprocessor environment provided by the
* {@link org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost} from the region where this
* coprocessor is loaded. Since this is a coprocessor endpoint, it always expects to be loaded
* on a table region, so always expects this to be an instance of
* {@link RegionCoprocessorEnvironment}.
* @param env the environment provided by the coprocessor host
* @throws IOException if the provided environment is not an instance of
* {@code RegionCoprocessorEnvironment}
*/
@Override
public void start(CoprocessorEnvironment env) throws IOException {
if (env instanceof RegionCoprocessorEnvironment) {
this.env = (RegionCoprocessorEnvironment)env;
} else {
throw new CoprocessorException("Must be loaded on a table region!");
}
}
@Override
public void stop(CoprocessorEnvironment env) throws IOException {
// nothing to do
}
}
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.coprocessor.ColumnInterpreter;
import org.apache.hadoop.hbase.ipc.BlockingRpcCallback;
import org.apache.hadoop.hbase.ipc.ServerRpcController;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateRequest;
import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateResponse;
import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateService;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.yarn.webapp.hamlet.HamletSpec.Q;
import com.google.protobuf.ByteString;
import com.google.protobuf.Message;
/**
* This client class is for invoking the aggregate functions deployed on the
* Region Server side via the AggregateService. This class will implement the
* supporting functionality for summing/processing the individual results
* obtained from the AggregateService for each region.
* <p>
* This will serve as the client side handler for invoking the aggregate
* functions.
* <ul>
* For all aggregate functions,
* <li>start row < end row is an essential condition (if they are not
* {@link HConstants#EMPTY_BYTE_ARRAY})
* <li>Column family can't be null. In case where multiple families are
* provided, an IOException will be thrown. An optional column qualifier can
* also be defined.
* <li>For methods to find maximum, minimum, sum, rowcount, it returns the
* parameter type. For average and std, it returns a double value. For row
* count, it returns a long value.
*/
@InterfaceAudience.Private
public class AggregationClient {
private static final Log log = LogFactory.getLog(AggregationClient.class);
Configuration conf;
/**
* Constructor with Conf object
*
* @param cfg
*/
public AggregationClient(Configuration cfg) {
this.conf = cfg;
}
/**
* It gives the maximum value of a column for a given column family for the
* given range. In case qualifier is null, a max of all values for the given
* family is returned.
*
* @param tableName
* @param ci
* @param scan
* @return max val <R>
* @throws Throwable
* The caller is supposed to handle the exception as they are
* thrown & propagated to it.
*/
public <R, S, P extends Message, Q extends Message, T extends Message> R max(
final TableName tableName,
final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)
throws Throwable {
HTable table = null;
try {
table = new HTable(conf, tableName);
return max(table, ci, scan);
} finally {
if (table != null) {
table.close();
}
}
}
/**
* It gives the maximum value of a column for a given column family for the
* given range. In case qualifier is null, a max of all values for the given
* family is returned.
*
* @param table
* @param ci
* @param scan
* @return max val <R>
* @throws Throwable
* The caller is supposed to handle the exception as they are
* thrown & propagated to it.
*/
public <R, S, P extends Message, Q extends Message, T extends Message> R max(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
final Scan scan) throws Throwable {
final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);
class MaxCallBack implements Batch.Callback<R> {
R max = null;
R getMax() {
return max;
}
@Override
public synchronized void update(byte[] region, byte[] row, R result) {
max = (max == null || (result != null && ci
.compare(max, result) < 0)) ? result : max;
}
}
MaxCallBack aMaxCallBack = new MaxCallBack();
table.coprocessorService(AggregateService.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateService, R>() {
@Override
public R call(AggregateService instance) throws IOException {
ServerRpcController controller = new ServerRpcController();
BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();
instance.getMax(controller, requestArg, rpcCallback);
AggregateResponse response = rpcCallback.get();
if (controller.failedOnException()) {
throw controller.getFailedOn();
}
if (response.getFirstPartCount() > 0) {
ByteString b = response.getFirstPart(0);
Q q = ProtobufUtil.getParsedGenericInstance(
ci.getClass(), 3, b);
return ci.getCellValueFromProto(q);
}
return null;
}
}, aMaxCallBack);
return aMaxCallBack.getMax();
}
/*
* @param scan
*
* @param canFamilyBeAbsent whether column family can be absent in familyMap
* of scan
*/
private void validateParameters(Scan scan, boolean canFamilyBeAbsent)
throws IOException {
if (scan == null
|| (Bytes.equals(scan.getStartRow(), scan.getStopRow()) && !Bytes
.equals(scan.getStartRow(), HConstants.EMPTY_START_ROW))
|| ((Bytes.compareTo(scan.getStartRow(), scan.getStopRow()) > 0) && !Bytes
.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW))) {
throw new IOException(
"Agg client Exception: Startrow should be smaller than Stoprow");
} else if (!canFamilyBeAbsent) {
if (scan.getFamilyMap().size() != 1) {
throw new IOException("There must be only one family.");
}
}
}
/**
* It gives the minimum value of a column for a given column family for the
* given range. In case qualifier is null, a min of all values for the given
* family is returned.
*
* @param tableName
* @param ci
* @param scan
* @return min val <R>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> R min(
final TableName tableName,
final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)
throws Throwable {
HTable table = null;
try {
table = new HTable(conf, tableName);
return min(table, ci, scan);
} finally {
if (table != null) {
table.close();
}
}
}
/**
* It gives the minimum value of a column for a given column family for the
* given range. In case qualifier is null, a min of all values for the given
* family is returned.
*
* @param table
* @param ci
* @param scan
* @return min val <R>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> R min(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
final Scan scan) throws Throwable {
final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);
class MinCallBack implements Batch.Callback<R> {
private R min = null;
public R getMinimum() {
return min;
}
@Override
public synchronized void update(byte[] region, byte[] row, R result) {
min = (min == null || (result != null && ci
.compare(result, min) < 0)) ? result : min;
}
}
MinCallBack minCallBack = new MinCallBack();
table.coprocessorService(AggregateService.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateService, R>() {
@Override
public R call(AggregateService instance) throws IOException {
ServerRpcController controller = new ServerRpcController();
BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();
instance.getMin(controller, requestArg, rpcCallback);
AggregateResponse response = rpcCallback.get();
if (controller.failedOnException()) {
throw controller.getFailedOn();
}
if (response.getFirstPartCount() > 0) {
ByteString b = response.getFirstPart(0);
Q q = ProtobufUtil.getParsedGenericInstance(
ci.getClass(), 3, b);
return ci.getCellValueFromProto(q);
}
return null;
}
}, minCallBack);
log.debug("Min fom all regions is: " + minCallBack.getMinimum());
return minCallBack.getMinimum();
}
/**
* It gives the row count, by summing up the individual results obtained
* from regions. In case the qualifier is null, FirstKeyValueFilter is used
* to optimised the operation. In case qualifier is provided, I can't use
* the filter as it may set the flag to skip to next row, but the value read
* is not of the given filter: in this case, this particular row will not be
* counted ==> an error.
*
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> long rowCount(
final TableName tableName,
final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)
throws Throwable {
HTable table = null;
try {
table = new HTable(conf, tableName);
return rowCount(table, ci, scan);
} finally {
if (table != null) {
table.close();
}
}
}
/**
* It gives the row count, by summing up the individual results obtained
* from regions. In case the qualifier is null, FirstKeyValueFilter is used
* to optimised the operation. In case qualifier is provided, I can't use
* the filter as it may set the flag to skip to next row, but the value read
* is not of the given filter: in this case, this particular row will not be
* counted ==> an error.
*
* @param table
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> long rowCount(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
final Scan scan) throws Throwable {
final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, true);
class RowNumCallback implements Batch.Callback<Long> {
private final AtomicLong rowCountL = new AtomicLong(0);
public long getRowNumCount() {
return rowCountL.get();
}
@Override
public void update(byte[] region, byte[] row, Long result) {
rowCountL.addAndGet(result.longValue());
}
}
RowNumCallback rowNum = new RowNumCallback();
table.coprocessorService(AggregateService.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateService, Long>() {
@Override
public Long call(AggregateService instance)
throws IOException {
ServerRpcController controller = new ServerRpcController();
BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();
instance.getRowNum(controller, requestArg, rpcCallback);
AggregateResponse response = rpcCallback.get();
if (controller.failedOnException()) {
throw controller.getFailedOn();
}
byte[] bytes = getBytesFromResponse(response
.getFirstPart(0));
ByteBuffer bb = ByteBuffer.allocate(8).put(bytes);
bb.rewind();
return bb.getLong();
}
}, rowNum);
return rowNum.getRowNumCount();
}
/**
* It sums up the value returned from various regions. In case qualifier is
* null, summation of all the column qualifiers in the given family is done.
*
* @param tableName
* @param ci
* @param scan
* @return sum <S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> S sum(
final TableName tableName,
final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)
throws Throwable {
HTable table = null;
try {
table = new HTable(conf, tableName);
return sum(table, ci, scan);
} finally {
if (table != null) {
table.close();
}
}
}
/**
* It sums up the value returned from various regions. In case qualifier is
* null, summation of all the column qualifiers in the given family is done.
*
* @param table
* @param ci
* @param scan
* @return sum <S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> S sum(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
final Scan scan) throws Throwable {
final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);
class SumCallBack implements Batch.Callback<S> {
S sumVal = null;
public S getSumResult() {
return sumVal;
}
@Override
public synchronized void update(byte[] region, byte[] row, S result) {
sumVal = ci.add(sumVal, result);
}
}
SumCallBack sumCallBack = new SumCallBack();
table.coprocessorService(AggregateService.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateService, S>() {
@Override
public S call(AggregateService instance) throws IOException {
ServerRpcController controller = new ServerRpcController();
BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();
instance.getSum(controller, requestArg, rpcCallback);
AggregateResponse response = rpcCallback.get();
if (controller.failedOnException()) {
throw controller.getFailedOn();
}
if (response.getFirstPartCount() == 0) {
return null;
}
ByteString b = response.getFirstPart(0);
T t = ProtobufUtil.getParsedGenericInstance(
ci.getClass(), 4, b);
S s = ci.getPromotedValueFromProto(t);
return s;
}
}, sumCallBack);
return sumCallBack.getSumResult();
}
/**
* It computes average while fetching sum and row count from all the
* corresponding regions. Approach is to compute a global sum of region
* level sum and rowcount and then compute the average.
*
* @param tableName
* @param scan
* @throws Throwable
*/
private <R, S, P extends Message, Q extends Message, T extends Message> Pair<S, Long> getAvgArgs(
final TableName tableName,
final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)
throws Throwable {
HTable table = null;
try {
table = new HTable(conf, tableName);
return getAvgArgs(table, ci, scan);
} finally {
if (table != null) {
table.close();
}
}
}
/**
* It computes average while fetching sum and row count from all the
* corresponding regions. Approach is to compute a global sum of region
* level sum and rowcount and then compute the average.
*
* @param table
* @param scan
* @throws Throwable
*/
private <R, S, P extends Message, Q extends Message, T extends Message> Pair<S, Long> getAvgArgs(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
final Scan scan) throws Throwable {
final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);
class AvgCallBack implements Batch.Callback<Pair<S, Long>> {
S sum = null;
Long rowCount = 0l;
public Pair<S, Long> getAvgArgs() {
return new Pair<S, Long>(sum, rowCount);
}
@Override
public synchronized void update(byte[] region, byte[] row,
Pair<S, Long> result) {
sum = ci.add(sum, result.getFirst());
rowCount += result.getSecond();
}
}
AvgCallBack avgCallBack = new AvgCallBack();
table.coprocessorService(AggregateService.class, scan.getStartRow(),
scan.getStopRow(),
new Batch.Call<AggregateService, Pair<S, Long>>() {
@Override
public Pair<S, Long> call(AggregateService instance)
throws IOException {
ServerRpcController controller = new ServerRpcController();
BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();
instance.getAvg(controller, requestArg, rpcCallback);
AggregateResponse response = rpcCallback.get();
if (controller.failedOnException()) {
throw controller.getFailedOn();
}
Pair<S, Long> pair = new Pair<S, Long>(null, 0L);
if (response.getFirstPartCount() == 0) {
return pair;
}
ByteString b = response.getFirstPart(0);
T t = ProtobufUtil.getParsedGenericInstance(
ci.getClass(), 4, b);
S s = ci.getPromotedValueFromProto(t);
pair.setFirst(s);
ByteBuffer bb = ByteBuffer.allocate(8).put(
getBytesFromResponse(response.getSecondPart()));
bb.rewind();
pair.setSecond(bb.getLong());
return pair;
}
}, avgCallBack);
return avgCallBack.getAvgArgs();
}
/**
* This is the client side interface/handle for calling the average method
* for a given cf-cq combination. It was necessary to add one more call
* stack as its return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary
* parameters to compute the average and returs the double value.
*
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> double avg(
final TableName tableName,
final ColumnInterpreter<R, S, P, Q, T> ci, Scan scan)
throws Throwable {
Pair<S, Long> p = getAvgArgs(tableName, ci, scan);
return ci.divideForAvg(p.getFirst(), p.getSecond());
}
/**
* This is the client side interface/handle for calling the average method
* for a given cf-cq combination. It was necessary to add one more call
* stack as its return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary
* parameters to compute the average and returs the double value.
*
* @param table
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> double avg(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
Scan scan) throws Throwable {
Pair<S, Long> p = getAvgArgs(table, ci, scan);
return ci.divideForAvg(p.getFirst(), p.getSecond());
}
/**
* It computes a global standard deviation for a given column and its value.
* Standard deviation is square root of (average of squares -
* average*average). From individual regions, it obtains sum, square sum and
* number of rows. With these, the above values are computed to get the
* global std.
*
* @param table
* @param scan
* @return standard deviations
* @throws Throwable
*/
private <R, S, P extends Message, Q extends Message, T extends Message> Pair<List<S>, Long> getStdArgs(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
final Scan scan) throws Throwable {
final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);
class StdCallback implements Batch.Callback<Pair<List<S>, Long>> {
long rowCountVal = 0l;
S sumVal = null, sumSqVal = null;
public Pair<List<S>, Long> getStdParams() {
List<S> l = new ArrayList<S>();
l.add(sumVal);
l.add(sumSqVal);
Pair<List<S>, Long> p = new Pair<List<S>, Long>(l, rowCountVal);
return p;
}
@Override
public synchronized void update(byte[] region, byte[] row,
Pair<List<S>, Long> result) {
if (result.getFirst().size() > 0) {
sumVal = ci.add(sumVal, result.getFirst().get(0));
sumSqVal = ci.add(sumSqVal, result.getFirst().get(1));
rowCountVal += result.getSecond();
}
}
}
StdCallback stdCallback = new StdCallback();
table.coprocessorService(AggregateService.class, scan.getStartRow(),
scan.getStopRow(),
new Batch.Call<AggregateService, Pair<List<S>, Long>>() {
@Override
public Pair<List<S>, Long> call(AggregateService instance)
throws IOException {
ServerRpcController controller = new ServerRpcController();
BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();
instance.getStd(controller, requestArg, rpcCallback);
AggregateResponse response = rpcCallback.get();
if (controller.failedOnException()) {
throw controller.getFailedOn();
}
Pair<List<S>, Long> pair = new Pair<List<S>, Long>(
new ArrayList<S>(), 0L);
if (response.getFirstPartCount() == 0) {
return pair;
}
List<S> list = new ArrayList<S>();
for (int i = 0; i < response.getFirstPartCount(); i++) {
ByteString b = response.getFirstPart(i);
T t = ProtobufUtil.getParsedGenericInstance(
ci.getClass(), 4, b);
S s = ci.getPromotedValueFromProto(t);
list.add(s);
}
pair.setFirst(list);
ByteBuffer bb = ByteBuffer.allocate(8).put(
getBytesFromResponse(response.getSecondPart()));
bb.rewind();
pair.setSecond(bb.getLong());
return pair;
}
}, stdCallback);
return stdCallback.getStdParams();
}
/**
* This is the client side interface/handle for calling the std method for a
* given cf-cq combination. It was necessary to add one more call stack as
* its return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary
* parameters to compute the std and returns the double value.
*
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> double std(
final TableName tableName, ColumnInterpreter<R, S, P, Q, T> ci,
Scan scan) throws Throwable {
HTable table = null;
try {
table = new HTable(conf, tableName);
return std(table, ci, scan);
} finally {
if (table != null) {
table.close();
}
}
}
/**
* This is the client side interface/handle for calling the std method for a
* given cf-cq combination. It was necessary to add one more call stack as
* its return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary
* parameters to compute the std and returns the double value.
*
* @param table
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> double std(
final HTable table, ColumnInterpreter<R, S, P, Q, T> ci, Scan scan)
throws Throwable {
Pair<List<S>, Long> p = getStdArgs(table, ci, scan);
double res = 0d;
double avg = ci.divideForAvg(p.getFirst().get(0), p.getSecond());
double avgOfSumSq = ci.divideForAvg(p.getFirst().get(1), p.getSecond());
res = avgOfSumSq - (avg) * (avg); // variance
res = Math.pow(res, 0.5);
return res;
}
/**
* It helps locate the region with median for a given column whose weight is
* specified in an optional column. From individual regions, it obtains sum
* of values and sum of weights.
*
* @param table
* @param ci
* @param scan
* @return pair whose first element is a map between start row of the region
* and (sum of values, sum of weights) for the region, the second
* element is (sum of values, sum of weights) for all the regions
* chosen
* @throws Throwable
*/
private <R, S, P extends Message, Q extends Message, T extends Message> Pair<NavigableMap<byte[], List<S>>, List<S>> getMedianArgs(
final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,
final Scan scan) throws Throwable {
final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);
final NavigableMap<byte[], List<S>> map = new TreeMap<byte[], List<S>>(
Bytes.BYTES_COMPARATOR);
class StdCallback implements Batch.Callback<List<S>> {
S sumVal = null, sumWeights = null;
public Pair<NavigableMap<byte[], List<S>>, List<S>> getMedianParams() {
List<S> l = new ArrayList<S>();
l.add(sumVal);
l.add(sumWeights);
Pair<NavigableMap<byte[], List<S>>, List<S>> p = new Pair<NavigableMap<byte[], List<S>>, List<S>>(
map, l);
return p;
}
@Override
public synchronized void update(byte[] region, byte[] row,
List<S> result) {
map.put(row, result);
sumVal = ci.add(sumVal, result.get(0));
sumWeights = ci.add(sumWeights, result.get(1));
}
}
StdCallback stdCallback = new StdCallback();
table.coprocessorService(AggregateService.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateService, List<S>>() {
@Override
public List<S> call(AggregateService instance)
throws IOException {
ServerRpcController controller = new ServerRpcController();
BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();
instance.getMedian(controller, requestArg, rpcCallback);
AggregateResponse response = rpcCallback.get();
if (controller.failedOnException()) {
throw controller.getFailedOn();
}
List<S> list = new ArrayList<S>();
for (int i = 0; i < response.getFirstPartCount(); i++) {
ByteString b = response.getFirstPart(i);
T t = ProtobufUtil.getParsedGenericInstance(
ci.getClass(), 4, b);
S s = ci.getPromotedValueFromProto(t);
list.add(s);
}
return list;
}
}, stdCallback);
return stdCallback.getMedianParams();
}
/**
* This is the client side interface/handler for calling the median method
* for a given cf-cq combination. This method collects the necessary
* parameters to compute the median and returns the median.
*
* @param tableName
* @param ci
* @param scan
* @return R the median
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> R median(
final TableName tableName, ColumnInterpreter<R, S, P, Q, T> ci,
Scan scan) throws Throwable {
HTable table = null;
try {
table = new HTable(conf, tableName);
return median(table, ci, scan);
} finally {
if (table != null) {
table.close();
}
}
}
/**
* This is the client side interface/handler for calling the median method
* for a given cf-cq combination. This method collects the necessary
* parameters to compute the median and returns the median.
*
* @param table
* @param ci
* @param scan
* @return R the median
* @throws Throwable
*/
public <R, S, P extends Message, Q extends Message, T extends Message> R median(
final HTable table, ColumnInterpreter<R, S, P, Q, T> ci, Scan scan)
throws Throwable {
Pair<NavigableMap<byte[], List<S>>, List<S>> p = getMedianArgs(table,
ci, scan);
byte[] startRow = null;
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> quals = scan.getFamilyMap().get(colFamily);
NavigableMap<byte[], List<S>> map = p.getFirst();
S sumVal = p.getSecond().get(0);
S sumWeights = p.getSecond().get(1);
double halfSumVal = ci.divideForAvg(sumVal, 2L);
double movingSumVal = 0;
boolean weighted = false;
if (quals.size() > 1) {
weighted = true;
halfSumVal = ci.divideForAvg(sumWeights, 2L);
}
for (Map.Entry<byte[], List<S>> entry : map.entrySet()) {
S s = weighted ? entry.getValue().get(1) : entry.getValue().get(0);
double newSumVal = movingSumVal + ci.divideForAvg(s, 1L);
if (newSumVal > halfSumVal)
break; // we found the region with the median
movingSumVal = newSumVal;
startRow = entry.getKey();
}
// scan the region with median and find it
Scan scan2 = new Scan(scan);
// inherit stop row from method parameter
if (startRow != null)
scan2.setStartRow(startRow);
ResultScanner scanner = null;
try {
int cacheSize = scan2.getCaching();
if (!scan2.getCacheBlocks() || scan2.getCaching() < 2) {
scan2.setCacheBlocks(true);
cacheSize = 5;
scan2.setCaching(cacheSize);
}
scanner = table.getScanner(scan2);
Result[] results = null;
byte[] qualifier = quals.pollFirst();
// qualifier for the weight column
byte[] weightQualifier = weighted ? quals.pollLast() : qualifier;
R value = null;
do {
results = scanner.next(cacheSize);
if (results != null && results.length > 0) {
for (int i = 0; i < results.length; i++) {
Result r = results[i];
// retrieve weight
Cell kv = r.getColumnLatest(colFamily, weightQualifier);
R newValue = ci
.getValue(colFamily, weightQualifier, kv);
S s = ci.castToReturnType(newValue);
double newSumVal = movingSumVal
+ ci.divideForAvg(s, 1L);
// see if we have moved past the median
if (newSumVal > halfSumVal) {
return value;
}
movingSumVal = newSumVal;
kv = r.getColumnLatest(colFamily, qualifier);
value = ci.getValue(colFamily, qualifier, kv);
}
}
} while (results != null && results.length > 0);
} finally {
if (scanner != null) {
scanner.close();
}
}
return null;
}
<R, S, P extends Message, Q extends Message, T extends Message> AggregateRequest validateArgAndGetPB(
Scan scan, ColumnInterpreter<R, S, P, Q, T> ci,
boolean canFamilyBeAbsent) throws IOException {
validateParameters(scan, canFamilyBeAbsent);
final AggregateRequest.Builder requestBuilder = AggregateRequest
.newBuilder();
requestBuilder
.setInterpreterClassName(ci.getClass().getCanonicalName());
P columnInterpreterSpecificData = null;
if ((columnInterpreterSpecificData = ci.getRequestData()) != null) {
requestBuilder
.setInterpreterSpecificBytes(columnInterpreterSpecificData
.toByteString());
}
requestBuilder.setScan(ProtobufUtil.toScan(scan));
return requestBuilder.build();
}
byte[] getBytesFromResponse(ByteString response) {
ByteBuffer bb = response.asReadOnlyByteBuffer();
bb.rewind();
byte[] bytes;
if (bb.hasArray()) {
bytes = bb.array();
} else {
bytes = response.toByteArray();
}
return bytes;
}
}
具体调用方法:
1 首先将这两个类打成jar包,上传到hdfs上去,比如我上传到了 hdfs://master68:8020/sharelib/aggregate.jar
2 将该协处理器设置到某个具体的表上,我这里假设设置在userinfo表上
public static void main(String[] args) throws MasterNotRunningException,
Exception {
// TODO Auto-generated method stub
byte[] tableName = Bytes.toBytes("userinfo");
Configuration conf = HBaseConfiguration.create();
HBaseAdmin admin = new HBaseAdmin(conf);
admin.disableTable(tableName);
HTableDescriptor htd = admin.getTableDescriptor(tableName);
htd.addCoprocessor(AggregateImplementation.class.getName(), new Path("hdfs://master68:8020/sharelib/aggregate.jar"), 1001, null);
//htd.removeCoprocessor(RowCountEndpoint.class.getName());
admin.modifyTable(tableName, htd);
admin.enableTable(tableName);
admin.close();
}
3 写个测试类测试一下:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.LongColumnInterpreter;
public class TestAggregationClient {
private static Configuration hbaseConfig = null;
static {
hbaseConfig = HBaseConfiguration.create();
}
public static void main(String[] args) throws Throwable {
AggregationClient client = new AggregationClient(hbaseConfig);
final Scan scan;
scan = new Scan();
Long count = client.rowCount(TableName.valueOf("userinfo"),
new LongColumnInterpreter(), scan);
System.out.println(count);
}
}