Testing driver
Two classes are mainly involved: FileReader and RecordBatchReader.
A RecordBatchReader instance is obtained via parquet_reader->GetRecordBatchReader:
std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
std::shared_ptr<arrow::RecordBatchReader> record_batch_reader;
ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
::arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
properties, &parquet_reader));
std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
row_group_indices, local_column_indices, &record_batch_reader));
// record_batch and the counters are declared here for completeness;
// elapse_read and TIME_NANO_OR_THROW come from the benchmark harness.
std::shared_ptr<arrow::RecordBatch> record_batch;
int64_t num_batches = 0;
int64_t num_rows = 0;
do {
TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
if (record_batch) {
// batches.push_back(record_batch);
num_batches += 1;
num_rows += record_batch->num_rows();
}
} while (record_batch);
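For reference, the same flow without the test macros looks roughly like this. A minimal sketch, assuming a recent Arrow where parquet::arrow::OpenFile and the shared_ptr overload of GetRecordBatchReader are available; the file path and the CountRows helper are hypothetical.

#include <numeric>
#include <arrow/api.h>
#include <arrow/io/file.h>
#include <parquet/arrow/reader.h>

// Minimal sketch: open a Parquet file, stream it batch by batch,
// and count rows. "example.parquet" is a hypothetical path.
arrow::Status CountRows(int64_t* num_rows) {
  ARROW_ASSIGN_OR_RAISE(auto infile,
                        arrow::io::ReadableFile::Open("example.parquet"));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(
      infile, arrow::default_memory_pool(), &reader));
  // Read every row group; a real caller could pass a subset instead.
  std::vector<int> row_groups(reader->num_row_groups());
  std::iota(row_groups.begin(), row_groups.end(), 0);
  std::shared_ptr<arrow::RecordBatchReader> batch_reader;
  ARROW_RETURN_NOT_OK(reader->GetRecordBatchReader(row_groups, &batch_reader));
  std::shared_ptr<arrow::RecordBatch> batch;
  *num_rows = 0;
  do {
    ARROW_RETURN_NOT_OK(batch_reader->ReadNext(&batch));
    if (batch) *num_rows += batch->num_rows();
  } while (batch);
  return arrow::Status::OK();
}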
GetRecordBatchReader
reader.cc
Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
const std::vector<int>& column_indices,
std::unique_ptr<RecordBatchReader>* out) {
RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
if (reader_properties_.pre_buffer()) {
// PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
BEGIN_PARQUET_CATCH_EXCEPTIONS
ARROW_UNUSED(reader_->PreBuffer(row_groups, column_indices,
reader_properties_.io_context(),
reader_properties_.cache_options()));
END_PARQUET_CATCH_EXCEPTIONS
}
std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
std::shared_ptr<::arrow::Schema> batch_schema;
RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema));
if (readers.empty()) {
// Just generate all batches right now; they're cheap since they have no columns.
int64_t batch_size = properties().batch_size();
auto max_sized_batch =
::arrow::RecordBatch::Make(batch_schema, batch_size, ::arrow::ArrayVector{});
::arrow::RecordBatchVector batches;
for (int row_group : row_groups) {
int64_t num_rows = parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
batches.insert(batches.end(), num_rows / batch_size, max_sized_batch);
if (int64_t trailing_rows = num_rows % batch_size) {
batches.push_back(max_sized_batch->Slice(0, trailing_rows));
}
}
*out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
::arrow::MakeVectorIterator(std::move(batches)), std::move(batch_schema));
return Status::OK();
}
int64_t num_rows = 0;
for (int row_group : row_groups) {
num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
}
using ::arrow::RecordBatchIterator;
// NB: This lambda will be invoked outside the scope of this call to
// `GetRecordBatchReader()`, so it must capture `readers` and `batch_schema` by value.
// `this` is a non-owning pointer so we are relying on the parent FileReader outliving
// this RecordBatchReader.
::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
[readers, batch_schema, num_rows,
this]() mutable -> ::arrow::Result<RecordBatchIterator> {
::arrow::ChunkedArrayVector columns(readers.size());
// don't reserve more rows than necessary
int64_t batch_size = std::min(properties().batch_size(), num_rows);
num_rows -= batch_size;
RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
reader_properties_.use_threads(), static_cast<int>(readers.size()),
[&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));
for (const auto& column : columns) {
if (column == nullptr || column->length() == 0) {
return ::arrow::IterationTraits<RecordBatchIterator>::End();
}
}
auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);
// NB: explicitly preserve table so that table_reader doesn't outlive it
return ::arrow::MakeFunctionIterator(
[table, table_reader] { return table_reader->Next(); });
});
*out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
return Status::OK();
}
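To make the empty-projection fast path concrete, here is its batch arithmetic with hypothetical numbers:

// Hypothetical numbers for the empty-projection fast path above:
// with batch_size = 4096 and a 10,000-row row group, the loop emits
// num_rows / batch_size = 2 full-size batches plus one trailing slice
// of num_rows % batch_size = 1808 rows. All of them alias the same
// zero-column RecordBatch, so this is essentially free.
int64_t batch_size = 4096, num_rows = 10000;
int64_t full_batches = num_rows / batch_size;   // 2
int64_t trailing_rows = num_rows % batch_size;  // 1808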
The key part is here: batches is an iterator of iterators whose lambda captures the individual column readers; flattening it produces the single stream of record batches that the returned reader drains.
*out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
    ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
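A toy sketch of that flatten pattern using Arrow's iterator utilities, with std::shared_ptr<int> standing in for record batches (for shared_ptr element types, nullptr is Arrow's end-of-iteration sentinel):

#include <arrow/util/iterator.h>
#include <memory>
#include <vector>

using Item = std::shared_ptr<int>;  // nullptr marks end of iteration

arrow::Iterator<Item> FlattenExample() {
  // Each inner iterator plays the role of one row-group-sized chunk.
  std::vector<arrow::Iterator<Item>> inners;
  inners.push_back(arrow::MakeVectorIterator(
      std::vector<Item>{std::make_shared<int>(1), std::make_shared<int>(2)}));
  inners.push_back(arrow::MakeVectorIterator(
      std::vector<Item>{std::make_shared<int>(3)}));
  // Iterator<Iterator<Item>> -> Iterator<Item>, yielding 1, 2, 3 in order.
  return arrow::MakeFlattenIterator(arrow::MakeVectorIterator(std::move(inners)));
}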
GetReader
// ----------------------------------------------------------------------
// File reader implementation
Status GetReader(const SchemaField& field, const std::shared_ptr<Field>& arrow_field,
const std::shared_ptr<ReaderContext>& ctx,
std::unique_ptr<ColumnReaderImpl>* out) {
BEGIN_PARQUET_CATCH_EXCEPTIONS
auto type_id = arrow_field->type()->id();
if (type_id == ::arrow::Type::EXTENSION) {
auto storage_field = arrow_field->WithType(
checked_cast<const ExtensionType&>(*arrow_field->type()).storage_type());
RETURN_NOT_OK(GetReader(field, storage_field, ctx, out));
out->reset(new ExtensionReader(arrow_field, std::move(*out)));
return Status::OK();
}
if (field.children.size() == 0) {
if (!field.is_leaf()) {
return Status::Invalid("Parquet non-leaf node has no children");
}
if (!ctx->IncludesLeaf(field.column_index)) {
*out = nullptr;
return Status::OK();
}
std::unique_ptr<FileColumnIterator> input(
ctx->iterator_factory(field.column_index, ctx->reader));
out->reset(new LeafReader(ctx, arrow_field, std::move(input), field.level_info));
} else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP ||
type_id == ::arrow::Type::FIXED_SIZE_LIST ||
type_id == ::arrow::Type::LARGE_LIST) {
auto list_field = arrow_field;
auto child = &field.children[0];
std::unique_ptr<ColumnReaderImpl> child_reader;
RETURN_NOT_OK(GetReader(*child, ctx, &child_reader));
if (child_reader == nullptr) {
*out = nullptr;
return Status::OK();
}
if (type_id == ::arrow::Type::LIST ||
type_id == ::arrow::Type::MAP) { // Map can be reconstructed as list of structs.
if (type_id == ::arrow::Type::MAP &&
child_reader->field()->type()->num_fields() != 2) {
// This case applies if either key or value is filtered.
list_field = list_field->WithType(::arrow::list(child_reader->field()));
}
out->reset(new ListReader<int32_t>(ctx, list_field, field.level_info,
std::move(child_reader)));
} else if (type_id == ::arrow::Type::LARGE_LIST) {
out->reset(new ListReader<int64_t>(ctx, list_field, field.level_info,
std::move(child_reader)));
} else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) {
out->reset(new FixedSizeListReader(ctx, list_field, field.level_info,
std::move(child_reader)));
} else {
return Status::UnknownError("Unknown list type: ", field.field->ToString());
}
} else if (type_id == ::arrow::Type::STRUCT) {
std::vector<std::shared_ptr<Field>> child_fields;
std::vector<std::unique_ptr<ColumnReaderImpl>> child_readers;
for (const auto& child : field.children) {
std::unique_ptr<ColumnReaderImpl> child_reader;
RETURN_NOT_OK(GetReader(child, ctx, &child_reader));
if (!child_reader) {
// If all children were pruned, then we do not try to read this field
continue;
}
child_fields.push_back(child.field);
child_readers.emplace_back(std::move(child_reader));
}
if (child_fields.size() == 0) {
*out = nullptr;
return Status::OK();
}
auto filtered_field =
::arrow::field(arrow_field->name(), ::arrow::struct_(child_fields),
arrow_field->nullable(), arrow_field->metadata());
out->reset(new StructReader(ctx, filtered_field, field.level_info,
std::move(child_readers)));
} else {
return Status::Invalid("Unsupported nested type: ", arrow_field->ToString());
}
return Status::OK();
END_PARQUET_CATCH_EXCEPTIONS
}
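To trace the recursion, consider a hypothetical nested schema and the reader tree GetReader would assemble for it:

// Hypothetical schema to trace the recursion above:
//   s: struct<a: int32, b: list<int64>>
// GetReader(s) takes the STRUCT branch and recurses per child:
//   - GetReader(a): no children and is_leaf() -> LeafReader over column a
//   - GetReader(b): LIST branch -> recurse into the int64 item field
//       - GetReader(item): LeafReader over the item column
//     then wraps it: ListReader<int32_t>(..., LeafReader)
// and finally builds StructReader{LeafReader(a), ListReader(b)}.
// If the projection (ctx->IncludesLeaf) drops both leaves, every level
// returns nullptr and the struct field is pruned entirely.
auto schema = ::arrow::schema({::arrow::field(
    "s", ::arrow::struct_({::arrow::field("a", ::arrow::int32()),
                           ::arrow::field("b", ::arrow::list(::arrow::int64()))}))});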
Page read
cpp/src/parquet/column_reader.cc
std::shared_ptr<Page> SerializedPageReader::NextPage() {
...
PARQUET_ASSIGN_OR_THROW(auto page_buffer, stream_->Read(compressed_len));
if (page_buffer->size() != compressed_len) {
std::stringstream ss;
ss << "Page was smaller (" << page_buffer->size() << ") than expected ("
<< compressed_len << ")";
ParquetException::EofException(ss.str());
}
...
}
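compressed_len comes from the page header parsed just before this read, so a shorter buffer means the file was truncated mid-page. A minimal sketch of the same guard as a standalone helper (the helper itself is hypothetical):

#include <arrow/buffer.h>
#include <arrow/io/interfaces.h>
#include <parquet/exception.h>
#include <sstream>

// Hypothetical helper mirroring the guard above: read exactly n bytes
// from the stream or throw the same EOF error a truncated page produces.
std::shared_ptr<arrow::Buffer> ReadExactly(arrow::io::InputStream* stream,
                                           int64_t n) {
  PARQUET_ASSIGN_OR_THROW(auto buf, stream->Read(n));
  if (buf->size() != n) {
    std::stringstream ss;
    ss << "Short read (" << buf->size() << ") where " << n
       << " bytes were expected";
    parquet::ParquetException::EofException(ss.str());
  }
  return buf;
}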