Spark2 code

zhixingheyi_tian

已于 2023-02-15 10:40:04 修改

阅读量109

点赞数

分类专栏： spark 文章标签：大数据

于 2022-11-23 15:19:24 首次发布

本文链接：https://blog.csdn.net/zhixingheyi_tian/article/details/128001070

版权

spark 专栏收录该内容

107 篇文章 4 订阅

订阅专栏

substrait plan 入口

  /**
   * Produces the columnar-batch iterator for one partition of this RDD.
   *
   * When there are no child RDDs the first-stage iterator is created from the native
   * partition alone. Otherwise `split` is treated as a `FirstZippedPartitionsPartition`,
   * each child RDD is paired positionally with one of its partitions, and the resulting
   * per-child iterators are handed to the backend as an extra argument.
   *
   * @param split   the partition to compute
   * @param context the task context passed through to child RDDs and to the backend
   * @return the iterator returned by the backend's `genFirstStageIterator`
   */
  override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
    // NOTE(review): presumably pins the task according to the NUMA binding info — confirm.
    ExecutorManager.tryTaskSet(numaBindingInfo)

    val nativePartition = castNativePartition(split)
    if (rdds.nonEmpty) {
      // Pair every child RDD with the partition at the same position and open its iterator.
      val zippedPartition = split.asInstanceOf[FirstZippedPartitionsPartition]
      val childIterators = rdds.zip(zippedPartition.partitions).map { pair =>
        pair._1.iterator(pair._2, context)
      }
      BackendsApiManager.getIteratorApiInstance.genFirstStageIterator(
        nativePartition,
        loadNative,
        outputAttributes,
        context,
        pipelineTime,
        updateMetrics,
        updateNativeMetrics,
        childIterators)
    } else {
      // No child RDDs: the native partition is the only input.
      BackendsApiManager.getIteratorApiInstance.genFirstStageIterator(
        nativePartition,
        loadNative,
        outputAttributes,
        context,
        pipelineTime,
        updateMetrics,
        updateNativeMetrics)
    }
  }

ArrowWritableColumnVector.java

ArrowRecordBatch => ArrowWritableColumnVector

/**
 * Loads an Arrow record batch into a freshly created {@link VectorSchemaRoot} and wraps
 * the root's field vectors as {@code ArrowWritableColumnVector}s.
 *
 * @param capacity    capacity forwarded to the {@code loadColumns(int, ...)} overload
 * @param arrowSchema schema used to create the vector schema root
 * @param recordBatch record batch whose buffers are loaded into the root
 * @param allocator   allocator used to create the vector schema root
 * @return the column vectors produced by the {@code loadColumns(int, ...)} overload
 */
public static ArrowWritableColumnVector[] loadColumns(int capacity, Schema arrowSchema,
                                                        ArrowRecordBatch recordBatch,
                                                        BufferAllocator allocator) {
    VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator);
    try {
      VectorLoader loader = new VectorLoader(root);
      loader.load(recordBatch);
    } catch (RuntimeException | Error e) {
      // Nothing else references the root yet, so release it here instead of leaking
      // the vectors it allocated when loading fails.
      try {
        root.close();
      } catch (RuntimeException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
    return loadColumns(capacity, root.getFieldVectors());
  }

ArrowColumnarBatches.java

offload

  /**
   * Replaces the Arrow column vectors of {@code input} with the vectors of a native batch
   * created from the same data, and returns {@code input}.
   *
   * <p>The batch is exported through the Arrow C data structs, handed to native code via
   * {@code ColumnarBatchJniWrapper.createWithArrowArray}, and the resulting batch's vectors
   * are transferred back into {@code input}. A batch with zero columns is returned as is.
   *
   * @param allocator allocator used for the temporary {@code ArrowArray}/{@code ArrowSchema}
   *                  C structs (the export itself uses
   *                  {@code ArrowBufferAllocators.contextInstance()})
   * @param input     the batch to offload; must be an Arrow columnar batch
   * @return {@code input}, now populated with the vectors of the native batch
   * @throws IllegalArgumentException if {@code input} is not an Arrow columnar batch
   * @throws IllegalStateException    if the columns of {@code input} do not all report the
   *                                  same reference count, or no count could be determined
   */
  public static ColumnarBatch offload(BufferAllocator allocator, ColumnarBatch input) {
    if (!isArrowColumnarBatch(input)) {
      throw new IllegalArgumentException("batch is not Arrow columnar batch");
    }
    if (input.numCols() == 0) {
      return input;
    }
    // The C structs are only a transport; try-with-resources closes them on every path.
    try (ArrowArray cArray = ArrowArray.allocateNew(allocator);
         ArrowSchema cSchema = ArrowSchema.allocateNew(allocator)) {
      GlutenArrowAbiUtil.exportFromSparkColumnarBatch(
          ArrowBufferAllocators.contextInstance(), input, cSchema, cArray);
      long handle = ColumnarBatchJniWrapper.INSTANCE.createWithArrowArray(cSchema.memoryAddress(),
          cArray.memoryAddress());
      ColumnarBatch output = GlutenColumnarBatches.create(handle);

      // Follow input's reference count. This might be optimized using
      // automatic clean-up or once the extensibility of ColumnarBatch is enriched.
      // -1 is the "not yet seen" sentinel; every column must report the same count.
      long refCnt = -1L;
      for (int i = 0; i < input.numCols(); i++) {
        // NOTE(review): the cast presumably cannot fail after isArrowColumnarBatch — confirm.
        ArrowWritableColumnVector col = ((ArrowWritableColumnVector) input.column(i));
        long colRefCnt = col.refCnt();
        if (refCnt == -1L) {
          refCnt = colRefCnt;
        } else {
          if (colRefCnt != refCnt) {
            throw new IllegalStateException(
                "Inconsistent reference counts in input batch: column " + i + " has "
                    + colRefCnt + ", expected " + refCnt);
          }
        }
      }
      // Defensive: the loop ran at least once (numCols() != 0), so this only fires if the
      // count is still the sentinel value.
      if (refCnt == -1L) {
        throw new IllegalStateException(
            "Unable to determine the reference count of the input batch");
      }
      // Retain the new batch (refCnt - 1) more times so its count matches the input's.
      final GlutenIndicatorVector giv = (GlutenIndicatorVector) output.column(0);
      for (long i = 0; i < (refCnt - 1); i++) {
        giv.retain();
      }

      // Close the input once per reference it held.
      for (long i = 0; i < refCnt; i++) {
        input.close();
      }

      // Populate the new vectors into input so callers keep using the same batch object.
      transferVectors(output, input);
      return input;
    }
  }

JNI ArrowCStructColumnarBatch

// JNI entry point for ColumnarBatchJniWrapper.createWithArrowArray.
//
// Interprets c_schema / c_array as addresses of ArrowSchema / ArrowArray structs, moves
// their contents into newly allocated structs, wraps those in an ArrowCStructColumnarBatch
// and returns the value produced by inserting that batch into gluten_columnarbatch_holder_.
// NOTE(review): -1L passed to JNI_METHOD_END is presumably the value returned on error — confirm.
JNIEXPORT jlong JNICALL Java_io_glutenproject_columnarbatch_ColumnarBatchJniWrapper_createWithArrowArray(
    JNIEnv* env,
    jobject,
    jlong c_schema,
    jlong c_array) {
  JNI_METHOD_START
  // Destination structs that the new batch will own.
  auto owned_schema = std::make_unique<ArrowSchema>();
  auto owned_array = std::make_unique<ArrowArray>();

  // The Java side passes the raw addresses of its C data structs.
  auto* source_schema = reinterpret_cast<ArrowSchema*>(c_schema);
  auto* source_array = reinterpret_cast<ArrowArray*>(c_array);

  // Move the contents out of the caller's structs into the ones allocated above.
  ArrowArrayMove(source_array, owned_array.get());
  ArrowSchemaMove(source_schema, owned_schema.get());

  std::shared_ptr<ColumnarBatch> native_batch =
      std::make_shared<ArrowCStructColumnarBatch>(std::move(owned_schema), std::move(owned_array));
  return gluten_columnarbatch_holder_.Insert(native_batch);
  JNI_METHOD_END(-1L)
}

export RecordBatch

  /**
   * Exports a Spark `ColumnarBatch` into the supplied Arrow C schema and array structs.
   *
   * The batch is first passed through `ArrowColumnarBatches.ensureLoaded`, then turned
   * into an Arrow schema and an Arrow record batch, which are exported via
   * `exportFromArrowRecordBatch`. The intermediate record batch is released whether or
   * not the export succeeds.
   *
   * @param allocator     allocator passed to `ensureLoaded` and to the export
   * @param columnarBatch the Spark batch to export
   * @param cSchema       destination C schema struct
   * @param cArray        destination C array struct
   */
  def exportFromSparkColumnarBatch(
      allocator: BufferAllocator,
      columnarBatch: ColumnarBatch,
      cSchema: ArrowSchema,
      cArray: ArrowArray): Unit = {
    val loadedBatch = ArrowColumnarBatches.ensureLoaded(allocator, columnarBatch)
    val arrowSchema = GlutenArrowUtil.toSchema(loadedBatch)
    val recordBatch = GlutenArrowUtil.createArrowRecordBatch(loadedBatch)
    // The record batch is only a temporary carrier; always release it after the export.
    try exportFromArrowRecordBatch(allocator, recordBatch, arrowSchema, cSchema, cArray)
    finally GlutenArrowUtil.releaseArrowRecordBatch(recordBatch)
  }

ArrowInIterator.java

ublic long next() {
    final ColumnarBatch batch = nextColumnarBatch();
    final ColumnarBatch offloaded =
        ArrowColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), batch);
    return GlutenColumnarBatches.getNativeHandle(offloaded);
  }