Arrow Parquet library

Testing driver

Two classes are mainly involved: FileReader and RecordBatchReader. A RecordBatchReader instance is obtained via parquet_reader->GetRecordBatchReader.

  // Excerpt from a benchmark driver: `file`, `properties`, `row_group_indices`,
  // `local_column_indices`, `elapse_read`, `num_batches` and `num_rows` are
  // defined by the surrounding test fixture.
  std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
  std::shared_ptr<arrow::RecordBatchReader> record_batch_reader;
  std::shared_ptr<arrow::RecordBatch> record_batch;
  ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
      ::arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
      properties, &parquet_reader));

  ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
      row_group_indices, local_column_indices, &record_batch_reader));
  do {
    TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));

    if (record_batch) {
      num_batches += 1;
      num_rows += record_batch->num_rows();
    }
  } while (record_batch);
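The `properties` object passed to `FileReader::Make` above is what `GetRecordBatchReader` consults below for batch size, pre-buffering and threading. A minimal sketch of constructing it (the values are illustrative):

#include <parquet/properties.h>

parquet::ArrowReaderProperties MakeReaderProperties() {
  parquet::ArrowReaderProperties properties;
  properties.set_batch_size(64 * 1024);  // rows per emitted RecordBatch
  properties.set_pre_buffer(true);       // PARQUET-1820: prefetch/coalesce column chunks
  properties.set_use_threads(true);      // decode columns in parallel
  return properties;
}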
GetRecordBatchReader

cpp/src/parquet/arrow/reader.cc

Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
                                            const std::vector<int>& column_indices,
                                            std::unique_ptr<RecordBatchReader>* out) {
  RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));

  if (reader_properties_.pre_buffer()) {
    // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
    BEGIN_PARQUET_CATCH_EXCEPTIONS
    ARROW_UNUSED(reader_->PreBuffer(row_groups, column_indices,
                                    reader_properties_.io_context(),
                                    reader_properties_.cache_options()));
    END_PARQUET_CATCH_EXCEPTIONS
  }

  std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
  std::shared_ptr<::arrow::Schema> batch_schema;
  RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema));

  if (readers.empty()) {
    // Just generate all batches right now; they're cheap since they have no columns.
    int64_t batch_size = properties().batch_size();
    auto max_sized_batch =
        ::arrow::RecordBatch::Make(batch_schema, batch_size, ::arrow::ArrayVector{});

    ::arrow::RecordBatchVector batches;

    for (int row_group : row_groups) {
      int64_t num_rows = parquet_reader()->metadata()->RowGroup(row_group)->num_rows();

      batches.insert(batches.end(), num_rows / batch_size, max_sized_batch);

      if (int64_t trailing_rows = num_rows % batch_size) {
        batches.push_back(max_sized_batch->Slice(0, trailing_rows));
      }
    }

    *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
        ::arrow::MakeVectorIterator(std::move(batches)), std::move(batch_schema));

    return Status::OK();
  }

  int64_t num_rows = 0;
  for (int row_group : row_groups) {
    num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
  }

  using ::arrow::RecordBatchIterator;

  // NB: This lambda will be invoked outside the scope of this call to
  // `GetRecordBatchReader()`, so it must capture `readers` and `batch_schema` by value.
  // `this` is a non-owning pointer so we are relying on the parent FileReader outliving
  // this RecordBatchReader.
  ::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
      [readers, batch_schema, num_rows,
       this]() mutable -> ::arrow::Result<RecordBatchIterator> {
        ::arrow::ChunkedArrayVector columns(readers.size());

        // don't reserve more rows than necessary
        int64_t batch_size = std::min(properties().batch_size(), num_rows);
        num_rows -= batch_size;

        RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
            reader_properties_.use_threads(), static_cast<int>(readers.size()),
            [&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));

        for (const auto& column : columns) {
          if (column == nullptr || column->length() == 0) {
            return ::arrow::IterationTraits<RecordBatchIterator>::End();
          }
        }

        auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
        auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);

        // NB: explicitly preserve table so that table_reader doesn't outlive it
        return ::arrow::MakeFunctionIterator(
            [table, table_reader] { return table_reader->Next(); });
      });

  *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
      ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));

  return Status::OK();
}

The key point is here: `batches` is an iterator of iterators. The outer `MakeFunctionIterator` lambda captures the per-column `ColumnReaderImpl`s by value and, on each call, decodes one batch worth of rows from every column; `MakeFlattenIterator` then flattens the nested iterators into a single stream of record batches (note the non-empty path installs `MakeFlattenIterator`, not the `MakeVectorIterator` used in the empty-column case):

   *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
       ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
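For intuition, here is a minimal, self-contained sketch of the same iterator-of-iterators pattern using Arrow's iterator utilities (note `arrow/util/iterator.h` is an internal header whose API may shift between versions; the integer data stands in for record batches):

#include <iostream>
#include <vector>

#include <arrow/util/iterator.h>

int main() {
  using ::arrow::Iterator;

  // Outer sequence: one inner iterator per "row group".
  std::vector<Iterator<int>> groups;
  groups.push_back(::arrow::MakeVectorIterator(std::vector<int>{1, 2}));
  groups.push_back(::arrow::MakeVectorIterator(std::vector<int>{3, 4, 5}));
  auto outer = ::arrow::MakeVectorIterator(std::move(groups));

  // MakeFlattenIterator turns Iterator<Iterator<int>> into a lazy flat
  // Iterator<int>, just as the reader flattens per-row-group batch
  // iterators into one stream of record batches.
  auto flat = ::arrow::MakeFlattenIterator(std::move(outer));
  while (true) {
    int value = flat.Next().ValueOrDie();
    // For int the default end-of-iteration sentinel is 0 (T{}), so the
    // sample values are deliberately non-zero.
    if (::arrow::IsIterationEnd(value)) break;
    std::cout << value << std::endl;
  }
  return 0;
}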
GetReader
// ----------------------------------------------------------------------
// File reader implementation

Status GetReader(const SchemaField& field, const std::shared_ptr<Field>& arrow_field,
                 const std::shared_ptr<ReaderContext>& ctx,
                 std::unique_ptr<ColumnReaderImpl>* out) {
  BEGIN_PARQUET_CATCH_EXCEPTIONS

  auto type_id = arrow_field->type()->id();

  if (type_id == ::arrow::Type::EXTENSION) {
    auto storage_field = arrow_field->WithType(
        checked_cast<const ExtensionType&>(*arrow_field->type()).storage_type());
    RETURN_NOT_OK(GetReader(field, storage_field, ctx, out));
    out->reset(new ExtensionReader(arrow_field, std::move(*out)));
    return Status::OK();
  }

  if (field.children.size() == 0) {
    if (!field.is_leaf()) {
      return Status::Invalid("Parquet non-leaf node has no children");
    }
    if (!ctx->IncludesLeaf(field.column_index)) {
      *out = nullptr;
      return Status::OK();
    }
    std::unique_ptr<FileColumnIterator> input(
        ctx->iterator_factory(field.column_index, ctx->reader));
    out->reset(new LeafReader(ctx, arrow_field, std::move(input), field.level_info));
  } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP ||
             type_id == ::arrow::Type::FIXED_SIZE_LIST ||
             type_id == ::arrow::Type::LARGE_LIST) {
    auto list_field = arrow_field;
    auto child = &field.children[0];
    std::unique_ptr<ColumnReaderImpl> child_reader;
    RETURN_NOT_OK(GetReader(*child, ctx, &child_reader));
    if (child_reader == nullptr) {
      *out = nullptr;
      return Status::OK();
    }
    if (type_id == ::arrow::Type::LIST ||
        type_id == ::arrow::Type::MAP) {  // Map can be reconstructed as list of structs.
      if (type_id == ::arrow::Type::MAP &&
          child_reader->field()->type()->num_fields() != 2) {
        // This case applies if either key or value is filtered.
        list_field = list_field->WithType(::arrow::list(child_reader->field()));
      }
      out->reset(new ListReader<int32_t>(ctx, list_field, field.level_info,
                                         std::move(child_reader)));
    } else if (type_id == ::arrow::Type::LARGE_LIST) {
      out->reset(new ListReader<int64_t>(ctx, list_field, field.level_info,
                                         std::move(child_reader)));

    } else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) {
      out->reset(new FixedSizeListReader(ctx, list_field, field.level_info,
                                         std::move(child_reader)));
    } else {
      return Status::UnknownError("Unknown list type: ", field.field->ToString());
    }
  } else if (type_id == ::arrow::Type::STRUCT) {
    std::vector<std::shared_ptr<Field>> child_fields;
    std::vector<std::unique_ptr<ColumnReaderImpl>> child_readers;
    for (const auto& child : field.children) {
      std::unique_ptr<ColumnReaderImpl> child_reader;
      RETURN_NOT_OK(GetReader(child, ctx, &child_reader));
      if (!child_reader) {
        // If all children were pruned, then we do not try to read this field
        continue;
      }
      child_fields.push_back(child.field);
      child_readers.emplace_back(std::move(child_reader));
    }
    if (child_fields.size() == 0) {
      *out = nullptr;
      return Status::OK();
    }
    auto filtered_field =
        ::arrow::field(arrow_field->name(), ::arrow::struct_(child_fields),
                       arrow_field->nullable(), arrow_field->metadata());
    out->reset(new StructReader(ctx, filtered_field, field.level_info,
                                std::move(child_readers)));
  } else {
    return Status::Invalid("Unsupported nested type: ", arrow_field->ToString());
  }
  return Status::OK();

  END_PARQUET_CATCH_EXCEPTIONS
}
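The pruning behavior above (a reader of `nullptr` when every selected leaf is excluded) is visible from the public API. A sketch, assuming a file at a hypothetical `path` whose schema contains nested columns; only leaf column 0 is selected, so every other leaf reader comes back `nullptr` and its parent struct or list is dropped from the result schema:

#include <iostream>
#include <memory>
#include <string>

#include <arrow/api.h>
#include <arrow/io/file.h>
#include <parquet/arrow/reader.h>

arrow::Status ReadOneLeaf(const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto infile, arrow::io::ReadableFile::Open(path));

  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));

  // Select only leaf column 0. For a STRUCT field whose children are all
  // pruned, GetReader sets *out = nullptr and the struct disappears from
  // the resulting schema (see the child_fields.size() == 0 branch above).
  std::shared_ptr<arrow::Table> table;
  ARROW_RETURN_NOT_OK(reader->ReadTable({0}, &table));
  std::cout << table->schema()->ToString() << std::endl;
  return arrow::Status::OK();
}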
Page read

cpp/src/parquet/column_reader.cc

std::shared_ptr<Page> SerializedPageReader::NextPage() {
  ...
  PARQUET_ASSIGN_OR_THROW(auto page_buffer, stream_->Read(compressed_len));
  if (page_buffer->size() != compressed_len) {
    std::stringstream ss;
    ss << "Page was smaller (" << page_buffer->size() << ") than expected ("
       << compressed_len << ")";
    ParquetException::EofException(ss.str());
  }
  ...
}
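`NextPage` is driven by the low-level (non-Arrow) column reader: each `ReadBatch` call pulls and decompresses pages through `SerializedPageReader`. A sketch, assuming a file at a hypothetical `path` whose first column is a required INT32:

#include <iostream>
#include <memory>
#include <string>

#include <parquet/api/reader.h>

void ReadFirstColumn(const std::string& path) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);
  std::shared_ptr<parquet::ColumnReader> col = reader->RowGroup(0)->Column(0);

  // Assumes physical type INT32 and a required column (max definition
  // level 0), so the def/rep level buffers may be nullptr.
  auto* int32_reader = static_cast<parquet::Int32Reader*>(col.get());
  int32_t values[1024];
  int64_t values_read = 0;
  while (int32_reader->HasNext()) {
    // Each call advances through data pages via SerializedPageReader::NextPage.
    int64_t levels = int32_reader->ReadBatch(1024, /*def_levels=*/nullptr,
                                             /*rep_levels=*/nullptr, values,
                                             &values_read);
    std::cout << "read " << levels << " levels, " << values_read << " values"
              << std::endl;
  }
}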