phoenix local index的使用和join table的原理
下面分析一下索引的协处理器的代码流程
UngroupedAggregateRegionObserver.doPostScannerOpen
// Excerpt from the body of doPostScannerOpen: read the attributes attached to the Scan
// and decide how the region scanner has to be wrapped.
// LOCAL_INDEX_BUILD, when present, carries the serialized IndexMaintainer list.
byte[] localIndexBytes = scan.getAttribute(LOCAL_INDEX_BUILD);
List<IndexMaintainer> indexMaintainers = localIndexBytes == null ? null : IndexMaintainer.deserialize(localIndexBytes);
// Only allocate a mutation buffer when the attribute is set.
List<Mutation> indexMutations = localIndexBytes == null ? Collections.<Mutation>emptyList() : Lists.<Mutation>newArrayListWithExpectedSize(1024);
// Start from the scanner passed in; it may be replaced by wrappers below.
RegionScanner theScanner = s;
byte[] indexUUID = scan.getAttribute(PhoenixIndexCodec.INDEX_UUID);
List<Expression> selectExpressions = null;
byte[] upsertSelectTable = scan.getAttribute(BaseScannerRegionObserver.UPSERT_SELECT_TABLE);
boolean isUpsert = false;
boolean isDelete = false;
byte[] deleteCQ = null;
byte[] deleteCF = null;
byte[] emptyCF = null;
ImmutableBytesWritable ptr = new ImmutableBytesWritable();
if (upsertSelectTable != null) {
// UPSERT SELECT: the target table and the select expressions travel in the scan attributes.
isUpsert = true;
projectedTable = deserializeTable(upsertSelectTable);
selectExpressions = deserializeExpressions(scan.getAttribute(BaseScannerRegionObserver.UPSERT_SELECT_EXPRS));
values = new byte[projectedTable.getPKColumns().size()][];
} else {
// isDelete is true only when the DELETE_AGG attribute equals PDataType.TRUE_BYTES.
byte[] isDeleteAgg = scan.getAttribute(BaseScannerRegionObserver.DELETE_AGG);
isDelete = isDeleteAgg != null && Bytes.compareTo(PDataType.TRUE_BYTES, isDeleteAgg) == 0;
if (!isDelete) {
// DELETE_CF / DELETE_CQ are read only when this is not a delete aggregation.
deleteCF = scan.getAttribute(BaseScannerRegionObserver.DELETE_CF);
deleteCQ = scan.getAttribute(BaseScannerRegionObserver.DELETE_CQ);
}
emptyCF = scan.getAttribute(BaseScannerRegionObserver.EMPTY_CF);
}
TupleProjector tupleProjector = null;
Region dataRegion = null;
byte[][] viewConstants = null;
ColumnReference[] dataColumns = IndexUtil.deserializeDataTableColumnsToJoin(scan);
boolean localIndexScan = ScanUtil.isLocalIndex(scan);
// p and j are null when the scan carries no projector / no hash-join info (see the null checks below).
final TupleProjector p = TupleProjector.deserializeProjectorFromScan(scan);
final HashJoinInfo j = HashJoinInfo.deserializeHashJoinFromScan(scan);
// Wrap the scanner either for a local index scan (that is neither a delete nor a
// DESC row-key-order upgrade), or when there is a projector but no hash join.
if ((localIndexScan && !isDelete && !isDescRowKeyOrderUpgrade) || (j == null && p != null)) {
if (dataColumns != null) {
// Data table columns are requested: prepare the pieces needed to read them from the data region.
tupleProjector = IndexUtil.getTupleProjector(scan, dataColumns);
dataRegion = IndexUtil.getDataRegion(env);
viewConstants = IndexUtil.deserializeViewConstantsFromScan(scan);
}
ImmutableBytesWritable tempPtr = new ImmutableBytesWritable();
// Build the wrapped RegionScanner; only the first IndexMaintainer is passed on.
theScanner =
getWrappedScanner(c, theScanner, offset, scan, dataColumns, tupleProjector,
dataRegion, indexMaintainers == null ? null : indexMaintainers.get(0), viewConstants, p, tempPtr);
}
if (j != null) {
// Hash-join info present: layer the join scanner on top of the (possibly wrapped) scanner.
theScanner = new HashJoinRegionScanner(theScanner, p, j, ScanUtil.getTenantId(scan), env);
}
从上面的代码可以看到:先判断本次扫描是否为本地索引扫描,如果是,就通过 getWrappedScanner 构建一个针对本地索引的 RegionScanner 包装对象。
下面进入 getWrappedScanner 方法进行分析
/**
 * Wraps {@code s} in a {@link RegionScanner} that post-processes every row returned by
 * {@code nextRaw}:
 * <ol>
 *   <li>when array element expressions are supplied, the full array cell is replaced by a
 *       synthetic cell holding only the requested element value(s);</li>
 *   <li>for local index scans (that are not ANALYZE scans) the row is handed to
 *       {@code IndexUtil.wrapResultUsingOffset} so the data table columns can be joined back;</li>
 *   <li>when {@code projector} is non-null the row is collapsed into the single projected cell.</li>
 * </ol>
 * All other methods delegate to {@code s}. Any {@link Throwable} raised while fetching a row is
 * rethrown through {@code ServerUtil.throwIOException} together with the region name.
 *
 * @param c               coprocessor context, used for the region name and for the join back
 * @param s               scanner being wrapped
 * @param arrayKVRefs     key-value columns that hold the arrays referenced by {@code arrayFuncRefs}
 * @param arrayFuncRefs   array element expressions to evaluate; may be null
 * @param offset          number of leading row key bytes skipped when joining back to the data table
 * @param scan            the scan being served
 * @param dataColumns     data table columns to fetch during the join back
 * @param tupleProjector  projector passed through to {@code IndexUtil.wrapResultUsingOffset}
 * @param dataRegion      data region passed through to {@code IndexUtil.wrapResultUsingOffset}
 * @param indexMaintainer maintainer passed through to {@code IndexUtil.wrapResultUsingOffset}
 * @param tx              not used by the wrapper itself
 * @param viewConstants   view constants passed through to {@code IndexUtil.wrapResultUsingOffset}
 * @param kvSchema        schema used to serialize the evaluated array element values
 * @param kvSchemaBitSet  bit set used together with {@code kvSchema}
 * @param projector       optional projector applied to every row; may be null
 * @param ptr             scratch pointer reused across rows
 * @return the wrapping scanner
 */
protected RegionScanner getWrappedScanner(final ObserverContext<RegionCoprocessorEnvironment> c,
        final RegionScanner s, final Set<KeyValueColumnExpression> arrayKVRefs,
        final Expression[] arrayFuncRefs, final int offset, final Scan scan,
        final ColumnReference[] dataColumns, final TupleProjector tupleProjector,
        final Region dataRegion, final IndexMaintainer indexMaintainer,
        Transaction tx,
        final byte[][] viewConstants, final KeyValueSchema kvSchema,
        final ValueBitSet kvSchemaBitSet, final TupleProjector projector,
        final ImmutableBytesWritable ptr) {
    return new RegionScanner() {

        @Override
        public boolean next(List<Cell> results) throws IOException {
            try {
                return s.next(results);
            } catch (Throwable t) {
                ServerUtil.throwIOException(c.getEnvironment().getRegion().getRegionInfo().getRegionNameAsString(), t);
                return false; // impossible
            }
        }

        @Override
        public boolean next(List<Cell> result, ScannerContext scannerContext) throws IOException {
            try {
                return s.next(result, scannerContext);
            } catch (Throwable t) {
                ServerUtil.throwIOException(c.getEnvironment().getRegion().getRegionInfo().getRegionNameAsString(), t);
                return false; // impossible
            }
        }

        @Override
        public void close() throws IOException {
            s.close();
        }

        @Override
        public HRegionInfo getRegionInfo() {
            return s.getRegionInfo();
        }

        @Override
        public boolean isFilterDone() throws IOException {
            return s.isFilterDone();
        }

        @Override
        public boolean reseek(byte[] row) throws IOException {
            return s.reseek(row);
        }

        @Override
        public long getMvccReadPoint() {
            return s.getMvccReadPoint();
        }

        @Override
        public boolean nextRaw(List<Cell> result) throws IOException {
            try {
                boolean next = s.nextRaw(result);
                Cell arrayElementCell = null;
                if (result.size() == 0) {
                    return next;
                }
                // Array element expressions were requested: swap the full array cell
                // for a cell that carries only the requested element value(s).
                if (arrayFuncRefs != null && arrayFuncRefs.length > 0 && arrayKVRefs.size() > 0) {
                    int arrayElementCellPosition = replaceArrayIndexElement(arrayKVRefs, arrayFuncRefs, result);
                    arrayElementCell = result.get(arrayElementCellPosition);
                }
                // Local index scan: translate the index row back to the data row and join
                // the requested data columns.
                // NOTE(review): the ScannerContext overload below also joins back when
                // offset > 0; this overload does not. Confirm which condition is intended.
                if (ScanUtil.isLocalIndex(scan) && !ScanUtil.isAnalyzeTable(scan)) {
                    IndexUtil.wrapResultUsingOffset(c, result, offset, dataColumns,
                            tupleProjector, dataRegion, indexMaintainer, viewConstants, ptr);
                }
                if (projector != null) {
                    // Collapse the row into the single projected cell, keeping the array element cell.
                    Tuple tuple = projector.projectResults(new ResultTuple(Result.create(result)));
                    result.clear();
                    result.add(tuple.getValue(0));
                    if (arrayElementCell != null) {
                        result.add(arrayElementCell);
                    }
                }
                return next;
            } catch (Throwable t) {
                ServerUtil.throwIOException(c.getEnvironment().getRegion().getRegionInfo().getRegionNameAsString(), t);
                return false; // impossible
            }
        }

        @Override
        public boolean nextRaw(List<Cell> result, ScannerContext scannerContext)
                throws IOException {
            try {
                boolean next = s.nextRaw(result, scannerContext);
                Cell arrayElementCell = null;
                if (result.size() == 0) {
                    return next;
                }
                // Array element expressions were requested: swap the full array cell
                // for a cell that carries only the requested element value(s).
                if (arrayFuncRefs != null && arrayFuncRefs.length > 0 && arrayKVRefs.size() > 0) {
                    int arrayElementCellPosition = replaceArrayIndexElement(arrayKVRefs, arrayFuncRefs, result);
                    arrayElementCell = result.get(arrayElementCellPosition);
                }
                // Local index scan (or a non-zero row key offset): translate the index row
                // back to the data row and join the requested data columns.
                if ((offset > 0 || ScanUtil.isLocalIndex(scan)) && !ScanUtil.isAnalyzeTable(scan)) {
                    IndexUtil.wrapResultUsingOffset(c, result, offset, dataColumns,
                            tupleProjector, dataRegion, indexMaintainer, viewConstants, ptr);
                }
                if (projector != null) {
                    // Collapse the row into the single projected cell, keeping the array element cell.
                    Tuple tuple = projector.projectResults(new ResultTuple(Result.create(result)));
                    result.clear();
                    result.add(tuple.getValue(0));
                    if (arrayElementCell != null) {
                        result.add(arrayElementCell);
                    }
                }
                return next;
            } catch (Throwable t) {
                ServerUtil.throwIOException(c.getEnvironment().getRegion().getRegionInfo().getRegionNameAsString(), t);
                return false; // impossible
            }
        }

        /**
         * Removes from {@code result} the cells that hold the full arrays referenced by
         * {@code arrayKVRefs} and appends one synthetic cell containing the serialized
         * values of {@code arrayFuncRefs}.
         *
         * @return the position in {@code result} of the appended cell
         */
        private int replaceArrayIndexElement(final Set<KeyValueColumnExpression> arrayKVRefs,
                final Expression[] arrayFuncRefs, List<Cell> result) {
            // Evaluate against an immutable snapshot, because result is modified below.
            MultiKeyValueTuple tuple = new MultiKeyValueTuple(ImmutableList.copyOf(result));
            // The first cell supplies the row key and type for the synthetic cell.
            Cell rowKv = result.get(0);
            for (KeyValueColumnExpression kvExp : arrayKVRefs) {
                if (kvExp.evaluate(tuple, ptr)) {
                    // Search the live list rather than the snapshot: once an earlier
                    // reference has removed a cell, snapshot indices no longer line up with
                    // result, and removing by snapshot index would drop the wrong cell or
                    // run past the end of the list.
                    for (int idx = result.size() - 1; idx >= 0; idx--) {
                        Cell kv = result.get(idx);
                        if (Bytes.equals(kvExp.getColumnFamily(), 0, kvExp.getColumnFamily().length,
                                kv.getFamilyArray(), kv.getFamilyOffset(), kv.getFamilyLength())
                                && Bytes.equals(kvExp.getColumnName(), 0, kvExp.getColumnName().length,
                                kv.getQualifierArray(), kv.getQualifierOffset(), kv.getQualifierLength())) {
                            // remove the kv that has the full array values.
                            result.remove(idx);
                            break;
                        }
                    }
                }
            }
            // Using KeyValueSchema to serialize the evaluated array element values.
            byte[] value = kvSchema.toBytes(tuple, arrayFuncRefs,
                    kvSchemaBitSet, ptr);
            // Add a dummy kv with the exact value of the array index
            result.add(new KeyValue(rowKv.getRowArray(), rowKv.getRowOffset(), rowKv.getRowLength(),
                    QueryConstants.ARRAY_VALUE_COLUMN_FAMILY, 0, QueryConstants.ARRAY_VALUE_COLUMN_FAMILY.length,
                    QueryConstants.ARRAY_VALUE_COLUMN_QUALIFIER, 0,
                    QueryConstants.ARRAY_VALUE_COLUMN_QUALIFIER.length, HConstants.LATEST_TIMESTAMP,
                    Type.codeToType(rowKv.getTypeByte()), value, 0, value.length));
            return result.size() - 1;
        }

        @Override
        public long getMaxResultSize() {
            return s.getMaxResultSize();
        }

        @Override
        public int getBatch() {
            return s.getBatch();
        }
    };
}
上面的方法构建了一个 RegionScanner 包装对象,它本质上就是一个迭代器(iterator)。在获取下一条数据的过程中,
会判断是否为本地索引扫描;如果是,就根据本地索引的主键还原出物理数据的主键,再去查询实际数据,
也就是调用下面这个从索引到数据的核心转换方法:
IndexUtil.wrapResultUsingOffset
// Excerpt (the method continues beyond what is shown here).
// When a tupleProjector is supplied, joins an index row back to its data row: the data row key
// is rebuilt from the index row key and the requested data columns are fetched with a Get.
public static void wrapResultUsingOffset(final ObserverContext<RegionCoprocessorEnvironment> c,
List<Cell> result, final int offset, ColumnReference[] dataColumns,
TupleProjector tupleProjector, Region dataRegion, IndexMaintainer indexMaintainer,
byte[][] viewConstants, ImmutableBytesWritable ptr) throws IOException {
if (tupleProjector != null) {
// Join back to data table here by issuing a local get projecting
// all of the cq:cf from the KeyValueColumnExpression into the Get.
Cell firstCell = result.get(0);
byte[] indexRowKey = firstCell.getRowArray();// backing array that contains the index row key
// Point ptr at the index row key, skipping its first `offset` bytes.
ptr.set(indexRowKey, firstCell.getRowOffset() + offset, firstCell.getRowLength() - offset);
// Rebuild the data table row key from the index row key.
byte[] dataRowKey = indexMaintainer.buildDataRowKey(ptr, viewConstants);
Get get = new Get(dataRowKey);
// Request only the data columns that have to be joined back.
for (int i = 0; i < dataColumns.length; i++) {
get.addColumn(dataColumns[i].getFamily(), dataColumns[i].getQualifier());
}
Result joinResult = null;
if (dataRegion != null) {
// A data region was handed in: read the row directly from it.
joinResult = dataRegion.get(get);
} else {
// No data region: resolve the data table name from this region's table name
// and read the row through a table handle obtained from the coprocessor environment.
TableName dataTable =
TableName.valueOf(MetaDataUtil.getUserTableName(c.getEnvironment()
.getRegion().getTableDesc().getNameAsString()));
HTableInterface table = null;
try {
table = c.getEnvironment().getTable(dataTable);
joinResult = table.get(get);
} finally {
// Always release the table handle, even when the get fails.
if (table != null) table.close();
}
}
从上面的源码可以看到,indexRowKey 就是索引的主键,通过调用 indexMaintainer.buildDataRowKey 方法
把它转换成 dataRowKey,然后再到物理数据表中查询数据。上面是一个公共方法。
另外,在最开始的方法中还用到了一个对象 HashJoinRegionScanner,它是用来做表关联(join)的
/**
 * Creates a scanner that joins the rows produced by {@code scanner} against server-side hash caches.
 *
 * @param scanner   underlying scanner that supplies the rows to be joined
 * @param projector projector for the scanned rows; when non-null it is given the bit set of the joined schema
 * @param joinInfo  join description: join types, join ids, schemas and an optional limit
 * @param tenantId  tenant whose cache holds the hash caches
 * @param env       coprocessor environment
 * @throws IOException (as {@code DoNotRetryIOException}) when a join type is not one of
 *         Inner, Left, Semi or Anti, or when the hash cache of a join id cannot be found
 */
public HashJoinRegionScanner(RegionScanner scanner, TupleProjector projector, HashJoinInfo joinInfo, ImmutableBytesWritable tenantId, RegionCoprocessorEnvironment env) throws IOException {
    // Reject unsupported join types before any state is set up.
    for (JoinType joinType : joinInfo.getJoinTypes()) {
        boolean supported = joinType == JoinType.Inner || joinType == JoinType.Left
                || joinType == JoinType.Semi || joinType == JoinType.Anti;
        if (!supported) {
            throw new DoNotRetryIOException("Got join type '" + joinType + "'. Expect only INNER or LEFT with hash-joins.");
        }
    }

    this.env = env;
    this.scanner = scanner;
    this.projector = projector;
    this.joinInfo = joinInfo;
    this.resultQueue = new LinkedList<Tuple>();
    this.hasMore = true;
    this.count = 0;
    // No limit unless the join info carries one.
    this.limit = Long.MAX_VALUE;
    if (joinInfo.getLimit() != null) {
        this.limit = joinInfo.getLimit();
    }

    // One slot per joined table.
    ImmutableBytesPtr[] joinIds = joinInfo.getJoinIds();
    int joinCount = joinIds.length;
    this.tempTuples = new List[joinCount];
    this.hashCaches = new HashCache[joinCount];
    this.tempSrcBitSet = new ValueBitSet[joinCount];
    TenantCache tenantCache = GlobalCache.getTenantCache(env, tenantId);
    for (int pos = 0; pos < joinCount; pos++) {
        ImmutableBytesPtr joinId = joinIds[pos];
        if (joinId.getLength() == 0) {
            // semi-join optimized into skip-scan
            hashCaches[pos] = null;
            tempSrcBitSet[pos] = null;
            tempTuples[pos] = null;
        } else {
            HashCache found = (HashCache) tenantCache.getServerCache(joinId);
            if (found == null) {
                throw new DoNotRetryIOException("Could not find hash cache for joinId: "
                        + Bytes.toString(joinId.get(), joinId.getOffset(), joinId.getLength())
                        + ". The cache might have expired and have been removed.");
            }
            hashCaches[pos] = found;
            tempSrcBitSet[pos] = ValueBitSet.newInstance(joinInfo.getSchemas()[pos]);
        }
    }

    if (this.projector != null) {
        this.tempDestBitSet = ValueBitSet.newInstance(joinInfo.getJoinedSchema());
        this.projector.setValueBitSet(tempDestBitSet);
    }
}
上面就是创建 join 对象的过程,里面保存了关联查询的相关信息,如关联方式、关联字段等。
/**
 * Returns the next joined row.
 * While {@code shouldAdvance()} holds, a row is pulled from the underlying scanner, joined by
 * {@code processResults} (which fills the result queue) and the scratch list is cleared;
 * afterwards the next queued row is handed out through {@code nextInQueue}.
 *
 * @param result list used as scratch space and as the output for the returned row
 * @return the value returned by {@code nextInQueue(result)}
 * @throws IOException any failure, rethrown through {@code ServerUtil.throwIOException}
 *         together with the region name
 */
public boolean nextRaw(List<Cell> result) throws IOException {
    try {
        while (shouldAdvance()) {
            // Remember whether the underlying scanner has more rows.
            hasMore = scanner.nextRaw(result);
            processResults(result, false);
            // The joined tuples are queued by now; empty the scratch list.
            result.clear();
        }
        return nextInQueue(result);
    } catch (Throwable failure) {
        String regionName = env.getRegion().getRegionInfo().getRegionNameAsString();
        ServerUtil.throwIOException(regionName, failure);
        return false; // impossible
    }
}
在 HashJoinRegionScanner 的 nextRaw 方法中,获取关联后的下一条数据时,会调用下面的 processResults 方法
/**
 * Joins one scanned row against the hash caches and appends the surviving joined tuples to
 * {@code resultQueue}.
 * <p>
 * Steps: (1) turn the cells into a tuple, projecting it when {@code joinInfo.forceProjection()}
 * is set; (2) for every join marked for early evaluation, probe its hash cache and give up on
 * the row when an Inner/Semi join finds no match or an Anti join finds one; (3) without a
 * projector, enqueue the tuple once per combination of matches; with a projector, merge the
 * matching right-hand tuples join by join; (4) apply the post-join filter, dropping tuples for
 * which it does not evaluate to {@code true}.
 *
 * @param result        cells of the scanned row; nothing happens when it is empty
 * @param hasBatchLimit must be {@code false}; joins are not supported on scans with a batch limit
 * @throws IOException declared by the signature; failures from the calls made here propagate
 * @throws UnsupportedOperationException when {@code hasBatchLimit} is {@code true}
 */
private void processResults(List<Cell> result, boolean hasBatchLimit) throws IOException {
    if (result.isEmpty()) {
        return;
    }
    // Tuple view of the incoming row.
    Tuple tuple = new ResultTuple(Result.create(result));
    // For backward compatibility. In new versions, HashJoinInfo.forceProjection()
    // always returns true.
    if (joinInfo.forceProjection()) {
        tuple = projector.projectResults(tuple);
    }
    // TODO: fix below Scanner.next() and Scanner.nextRaw() methods as well.
    if (hasBatchLimit) {
        throw new UnsupportedOperationException("Cannot support join operations in scans with limit");
    }
    int count = joinInfo.getJoinIds().length;
    boolean cont = true;
    // Early evaluation: probe every eligible hash cache with the key built from the row.
    for (int i = 0; i < count; i++) {
        if (!(joinInfo.earlyEvaluation()[i]) || hashCaches[i] == null) {
            continue;
        }
        ImmutableBytesPtr key = TupleUtil.getConcatenatedValue(tuple, joinInfo.getJoinExpressions()[i]);
        tempTuples[i] = hashCaches[i].get(key);
        JoinType type = joinInfo.getJoinTypes()[i];
        // Inner/Semi need a match, Anti needs the absence of one; otherwise drop the row.
        if (((type == JoinType.Inner || type == JoinType.Semi) && tempTuples[i] == null)
                || (type == JoinType.Anti && tempTuples[i] != null)) {
            cont = false;
            break;
        }
    }
    if (cont) {
        if (projector == null) {
            // No projector: emit the same tuple once per combination of matches.
            int dup = 1;
            for (int i = 0; i < count; i++) {
                dup *= (tempTuples[i] == null ? 1 : tempTuples[i].size());
            }
            for (int i = 0; i < dup; i++) {
                resultQueue.offer(tuple);
            }
        } else {
            KeyValueSchema schema = joinInfo.getJoinedSchema();
            if (!joinInfo.forceProjection()) { // backward compatibility
                tuple = projector.projectResults(tuple);
            }
            resultQueue.offer(tuple);
            for (int i = 0; i < count; i++) {
                boolean earlyEvaluation = joinInfo.earlyEvaluation()[i];
                JoinType type = joinInfo.getJoinTypes()[i];
                // Semi/Anti joins evaluated early were fully decided above.
                if (earlyEvaluation && (type == JoinType.Semi || type == JoinType.Anti)) {
                    continue;
                }
                // Process exactly the tuples queued so far; joined tuples are re-queued
                // behind them and picked up by the next join.
                int j = resultQueue.size();
                while (j-- > 0) {
                    Tuple lhs = resultQueue.poll();
                    if (!earlyEvaluation) {
                        // Late evaluation: the key is built from the partially joined tuple.
                        ImmutableBytesPtr key = TupleUtil.getConcatenatedValue(lhs, joinInfo.getJoinExpressions()[i]);
                        tempTuples[i] = hashCaches[i].get(key);
                        if (tempTuples[i] == null) {
                            if (type == JoinType.Inner || type == JoinType.Semi) {
                                continue;
                            } else if (type == JoinType.Anti) {
                                resultQueue.offer(lhs);
                                continue;
                            }
                        }
                    }
                    if (tempTuples[i] == null) {
                        // No match: keep the left side, merging a null right side.
                        Tuple joined = tempSrcBitSet[i] == ValueBitSet.EMPTY_VALUE_BITSET ?
                                lhs : TupleProjector.mergeProjectedValue(
                                        (ProjectedValueTuple) lhs, schema, tempDestBitSet,
                                        null, joinInfo.getSchemas()[i], tempSrcBitSet[i],
                                        joinInfo.getFieldPositions()[i]);
                        resultQueue.offer(joined);
                        continue;
                    }
                    // One joined tuple per matching right-hand tuple.
                    for (Tuple t : tempTuples[i]) {
                        Tuple joined = tempSrcBitSet[i] == ValueBitSet.EMPTY_VALUE_BITSET ?
                                lhs : TupleProjector.mergeProjectedValue(
                                        (ProjectedValueTuple) lhs, schema, tempDestBitSet,
                                        t, joinInfo.getSchemas()[i], tempSrcBitSet[i],
                                        joinInfo.getFieldPositions()[i]);
                        resultQueue.offer(joined);
                    }
                }
            }
        }
        // apply post-join filter
        Expression postFilter = joinInfo.getPostJoinFilterExpression();
        if (postFilter != null) {
            for (Iterator<Tuple> iter = resultQueue.iterator(); iter.hasNext();) {
                Tuple t = iter.next();
                postFilter.reset();
                ImmutableBytesWritable tempPtr = new ImmutableBytesWritable();
                try {
                    if (!postFilter.evaluate(t, tempPtr)) {
                        iter.remove();
                        continue;
                    }
                } catch (IllegalDataException e) {
                    // A tuple the filter cannot be evaluated on is treated as not matching.
                    iter.remove();
                    continue;
                }
                Boolean b = (Boolean)postFilter.getDataType().toObject(tempPtr);
                // A null result (the filter evaluated to SQL NULL) is treated as
                // "not matching" instead of failing with a NullPointerException on unboxing.
                if (b == null || !b.booleanValue()) {
                    iter.remove();
                }
            }
        }
    }
}
在上面的方法中,先用当前行到各个 hash cache 中查找匹配的数据并完成关联,最后再执行 post-join 过滤表达式,把不满足表达式的结果从队列中剔除。
总结:一个本地索引的使用,其实就是先去索引表进行查询,然后拿到主键后,再去物理数据表进行查询