Substrait plan entry point
/**
 * Computes the columnar batches for a single partition of this RDD.
 *
 * If this RDD has no upstream RDDs (`rdds` is empty) the first-stage iterator is
 * generated from the native partition alone. Otherwise `split` is treated as a
 * [[FirstZippedPartitionsPartition]], one iterator is opened per upstream RDD on its
 * matching partition, and those iterators are passed along as extra input.
 *
 * @param split   the partition to compute
 * @param context the context of the task running this computation
 * @return the iterator returned by the backend's `genFirstStageIterator`
 */
override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
  // NOTE(review): presumably applies NUMA binding for this task -- confirm in ExecutorManager.
  ExecutorManager.tryTaskSet(numaBindingInfo)
  val nativePartition = castNativePartition(split)

  if (rdds.isEmpty) {
    // No upstream RDDs: the native partition is the only input.
    BackendsApiManager.getIteratorApiInstance.genFirstStageIterator(
      nativePartition,
      loadNative,
      outputAttributes,
      context,
      pipelineTime,
      updateMetrics,
      updateNativeMetrics
    )
  } else {
    // Pair every upstream RDD with its partition and open an iterator on it.
    val upstreamSplits = split.asInstanceOf[FirstZippedPartitionsPartition].partitions
    val upstreamIterators = rdds.zip(upstreamSplits).map {
      case (upstream, upstreamSplit) => upstream.iterator(upstreamSplit, context)
    }
    BackendsApiManager.getIteratorApiInstance.genFirstStageIterator(
      nativePartition,
      loadNative,
      outputAttributes,
      context,
      pipelineTime,
      updateMetrics,
      updateNativeMetrics,
      upstreamIterators
    )
  }
}
ArrowWritableColumnVector.java
RecordBatch => ArrowWritableColumnVector
/**
 * Loads an Arrow record batch into a newly created {@link VectorSchemaRoot} and wraps the
 * root's field vectors through the {@code loadColumns(int, ...)} overload.
 *
 * <p>On success the root is intentionally left open, because the returned columns are
 * built from its field vectors. NOTE(review): presumably the returned columns take over
 * ownership of those vectors -- confirm in the overload.
 *
 * <p>If loading or wrapping fails, the root is closed before the exception propagates so
 * that the vectors created here are not leaked.
 *
 * @param capacity    capacity forwarded to the vector-based {@code loadColumns} overload
 * @param arrowSchema schema used to create the vector schema root
 * @param recordBatch record batch whose buffers are loaded into the root
 * @param allocator   allocator used to create the root's vectors
 * @return the column vectors produced from the loaded field vectors
 */
public static ArrowWritableColumnVector[] loadColumns(int capacity, Schema arrowSchema,
    ArrowRecordBatch recordBatch,
    BufferAllocator allocator) {
  VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator);
  try {
    VectorLoader loader = new VectorLoader(root);
    loader.load(recordBatch);
    return loadColumns(capacity, root.getFieldVectors());
  } catch (RuntimeException | Error e) {
    // Failure path only: release whatever the root holds, keeping the original error primary.
    try {
      root.close();
    } catch (RuntimeException closeError) {
      e.addSuppressed(closeError);
    }
    throw e;
  }
}
ArrowColumnarBatches.java
offload
/**
 * Replaces the Arrow column vectors of {@code input} with the vectors of a batch created
 * on the native side from the same data, and returns {@code input}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Reject batches that are not Arrow columnar batches; return zero-column batches
 *       unchanged.</li>
 *   <li>Export {@code input} into a temporary {@code ArrowSchema}/{@code ArrowArray} pair
 *       and hand their addresses to {@code createWithArrowArray}, which returns a handle.</li>
 *   <li>Build the output batch from that handle.</li>
 *   <li>Check that every input column reports the same reference count, then retain the
 *       output's first column {@code refCnt - 1} times.</li>
 *   <li>Close {@code input} {@code refCnt} times and transfer the output's vectors into
 *       it.</li>
 * </ol>
 *
 * @param allocator allocator used for the temporary {@code ArrowArray}/{@code ArrowSchema}
 * @param input     the Arrow columnar batch to offload
 * @return {@code input}, after {@code transferVectors(output, input)} has been applied
 * @throws IllegalArgumentException if {@code input} is not an Arrow columnar batch
 * @throws IllegalStateException    if the columns of {@code input} report different
 *                                  reference counts, or no reference count was determined
 */
public static ColumnarBatch offload(BufferAllocator allocator, ColumnarBatch input) {
  if (!isArrowColumnarBatch(input)) {
    throw new IllegalArgumentException("batch is not Arrow columnar batch");
  }
  if (input.numCols() == 0) {
    // Nothing to export for a batch without columns.
    return input;
  }
  // The temporary C structs are closed when the try block exits.
  try (ArrowArray cArray = ArrowArray.allocateNew(allocator);
      ArrowSchema cSchema = ArrowSchema.allocateNew(allocator)) {
    // NOTE(review): the export uses the context allocator rather than the `allocator`
    // parameter -- confirm this is intentional.
    GlutenArrowAbiUtil.exportFromSparkColumnarBatch(
        ArrowBufferAllocators.contextInstance(), input, cSchema, cArray);
    long handle = ColumnarBatchJniWrapper.INSTANCE.createWithArrowArray(cSchema.memoryAddress(),
        cArray.memoryAddress());
    ColumnarBatch output = GlutenColumnarBatches.create(handle);
    // Follow input's reference count. This might be optimized using
    // automatic clean-up or once the extensibility of ColumnarBatch is enriched
    long refCnt = -1L;
    for (int i = 0; i < input.numCols(); i++) {
      ArrowWritableColumnVector col = ((ArrowWritableColumnVector) input.column(i));
      long colRefCnt = col.refCnt();
      if (refCnt == -1L) {
        // First column seen (or none recorded yet): use its count as the reference value.
        refCnt = colRefCnt;
      } else {
        if (colRefCnt != refCnt) {
          throw new IllegalStateException(
              "Inconsistent reference counts in input batch: column " + i + " has "
                  + colRefCnt + ", expected " + refCnt);
        }
      }
    }
    if (refCnt == -1L) {
      throw new IllegalStateException(
          "Unable to determine the reference count of the input batch");
    }
    // Bring the output's first column up to the same count the input columns had.
    final GlutenIndicatorVector giv = (GlutenIndicatorVector) output.column(0);
    for (long i = 0; i < (refCnt - 1); i++) {
      giv.retain();
    }
    // close the input one, once per reference that was held on its columns
    for (long i = 0; i < refCnt; i++) {
      input.close();
    }
    // populate new vectors to input
    transferVectors(output, input);
    return input;
  }
}
JNI ArrowCStructColumnarBatch
// JNI entry point for ColumnarBatchJniWrapper.createWithArrowArray.
//
// Interprets `c_schema` and `c_array` as addresses of an ArrowSchema and an ArrowArray,
// moves both into newly allocated structs, wraps them in an ArrowCStructColumnarBatch,
// and returns the value produced by inserting that batch into
// `gluten_columnarbatch_holder_`.
//
// NOTE(review): JNI_METHOD_END(-1L) presumably makes the function return -1 when an
// error is caught by the JNI_METHOD_START/END macros -- confirm against their definition.
JNIEXPORT jlong JNICALL Java_io_glutenproject_columnarbatch_ColumnarBatchJniWrapper_createWithArrowArray(
    JNIEnv* env,
    jobject,
    jlong c_schema,
    jlong c_array) {
  JNI_METHOD_START
  // Allocate both destination structs before touching the caller's structs.
  auto owned_schema = std::make_unique<ArrowSchema>();
  auto owned_array = std::make_unique<ArrowArray>();

  // The jlong arguments carry raw addresses of the caller's C structs.
  auto* src_schema = reinterpret_cast<ArrowSchema*>(c_schema);
  auto* src_array = reinterpret_cast<ArrowArray*>(c_array);

  // Move the contents out of the caller's structs into the newly allocated ones.
  ArrowArrayMove(src_array, owned_array.get());
  ArrowSchemaMove(src_schema, owned_schema.get());

  std::shared_ptr<ColumnarBatch> batch =
      std::make_shared<ArrowCStructColumnarBatch>(std::move(owned_schema), std::move(owned_array));
  return gluten_columnarbatch_holder_.Insert(batch);
  JNI_METHOD_END(-1L)
}
export RecordBatch
/**
 * Exports a Spark columnar batch into the given Arrow C structs.
 *
 * The batch is first passed through `ArrowColumnarBatches.ensureLoaded`; its schema and
 * an Arrow record batch are then derived from the result and forwarded to
 * `exportFromArrowRecordBatch`. The record batch is released afterwards, whether or not
 * the export succeeded.
 *
 * @param allocator     allocator passed to `ensureLoaded` and to the export call
 * @param columnarBatch the Spark batch to export
 * @param cSchema       destination schema struct
 * @param cArray        destination array struct
 */
def exportFromSparkColumnarBatch(allocator: BufferAllocator, columnarBatch: ColumnarBatch,
    cSchema: ArrowSchema, cArray: ArrowArray): Unit = {
  val loadedBatch = ArrowColumnarBatches.ensureLoaded(allocator, columnarBatch)
  val arrowSchema = GlutenArrowUtil.toSchema(loadedBatch)
  val recordBatch = GlutenArrowUtil.createArrowRecordBatch(loadedBatch)
  try exportFromArrowRecordBatch(allocator, recordBatch, arrowSchema, cSchema, cArray)
  finally GlutenArrowUtil.releaseArrowRecordBatch(recordBatch)
}
ArrowInIterator.java
ublic long next() {
final ColumnarBatch batch = nextColumnarBatch();
final ColumnarBatch offloaded =
ArrowColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), batch);
return GlutenColumnarBatches.getNativeHandle(offloaded);
}