Arrow 之 IPC

RecordBatchStreamReader
// ----------------------------------------------------------------------
// RecordBatchStreamReader implementation

/// \brief Concrete RecordBatchStreamReader that pulls IPC messages from a
/// MessageReader and decodes them into record batches.
///
/// Call order as used by this class: Open() consumes the schema message, after
/// which each ReadNext() call produces the next record batch, or nullptr once the
/// stream has no more messages.
class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
 public:
  /// \brief Take ownership of the message reader and consume the schema message.
  ///
  /// \param[in] message_reader source of IPC messages; moved into this object
  /// \param[in] options read options; copied and reused by later reads
  /// \return Status::Invalid if no schema message is available, any error
  /// propagated from reading the message or unpacking the schema, OK otherwise
  Status Open(std::unique_ptr<MessageReader> message_reader,
              const IpcReadOptions& options) {
    message_reader_ = std::move(message_reader);
    options_ = options;

    // Read schema
    ARROW_ASSIGN_OR_RAISE(std::unique_ptr<Message> message, ReadNextMessage());
    if (!message) {
      return Status::Invalid("Tried reading schema message, was null or length 0");
    }

    // Fills the dictionary memo, both schemas, the field inclusion mask and the
    // endianness-swap flag from the schema message.
    RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_,
                                      &out_schema_, &field_inclusion_mask_,
                                      &swap_endian_));
    return Status::OK();
  }

  /// \brief Read the next record batch from the stream.
  ///
  /// On the first call the initial dictionaries are read first. Dictionary batches
  /// found between record batches are applied to the memo before the following
  /// record batch is decoded.
  ///
  /// \param[out] batch receives the decoded record batch, or nullptr when the
  /// stream is empty or has ended
  /// \return OK on success (including end of stream), otherwise the first error
  /// encountered while reading or decoding
  Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
    if (!have_read_initial_dictionaries_) {
      RETURN_NOT_OK(ReadInitialDictionaries());
    }

    if (empty_stream_) {
      // ARROW-6006: Degenerate case where stream contains no data, we do not
      // bother trying to read a RecordBatch message from the stream
      *batch = nullptr;
      return Status::OK();
    }

    // Continue to read other dictionaries, if any
    std::unique_ptr<Message> message;
    ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());

    while (message != nullptr && message->type() == MessageType::DICTIONARY_BATCH) {
      RETURN_NOT_OK(ReadDictionary(*message));
      ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());
    }

    if (message == nullptr) {
      // End of stream
      *batch = nullptr;
      return Status::OK();
    }

    // NOTE(review): CHECK_HAS_BODY is a macro defined outside this class;
    // presumably it returns an error when the message has no body — confirm.
    CHECK_HAS_BODY(*message);
    ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
    IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
    return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_,
                                   context, reader.get())
        .Value(batch);
  }

  /// \brief Return the output schema produced by UnpackSchemaMessage() in Open().
  std::shared_ptr<Schema> schema() const override { return out_schema_; }

  /// \brief Return a copy of the counters accumulated while reading.
  ReadStats stats() const override { return stats_; }

 private:
  /// \brief Fetch the next message from the underlying reader and update the
  /// message / record batch / dictionary batch counters in stats_.
  ///
  /// \return the message, a null pointer if the reader returned none, or the
  /// reader's error
  Result<std::unique_ptr<Message>> ReadNextMessage() {
    ARROW_ASSIGN_OR_RAISE(auto message, message_reader_->ReadNextMessage());
    if (message) {
      ++stats_.num_messages;
      switch (message->type()) {
        case MessageType::RECORD_BATCH:
          ++stats_.num_record_batches;
          break;
        case MessageType::DICTIONARY_BATCH:
          ++stats_.num_dictionary_batches;
          break;
        default:
          break;
      }
    }
    // NOTE(review): the explicit std::move looks deliberate, since the local's
    // type differs from the Result<> return type — confirm before removing.
    return std::move(message);
  }

  /// \brief Apply one dictionary batch to the memo and record in stats_ whether
  /// it was a delta or a replacement.
  ///
  /// \param[in] message the dictionary batch message to decode
  // Read dictionary from dictionary batch
  Status ReadDictionary(const Message& message) {
    // Defensive default so the switch below never inspects an indeterminate value;
    // DictionaryKind::New is the case that updates no counter.
    DictionaryKind kind = DictionaryKind::New;
    IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
    RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
    switch (kind) {
      case DictionaryKind::New:
        break;
      case DictionaryKind::Delta:
        ++stats_.num_dictionary_deltas;
        break;
      case DictionaryKind::Replacement:
        ++stats_.num_replaced_dictionaries;
        break;
    }
    return Status::OK();
  }

  /// \brief Read the dictionary batches expected at the start of the stream.
  ///
  /// The expected count is the number of dictionaries registered in the memo.
  /// If the stream ends before the first dictionary, empty_stream_ is set and OK
  /// is returned; ending after some but not all dictionaries, or meeting a
  /// non-dictionary message first, yields Status::Invalid.
  Status ReadInitialDictionaries() {
    // We must receive all dictionaries before reconstructing the
    // first record batch. Subsequent dictionary deltas modify the memo
    std::unique_ptr<Message> message;

    // TODO(wesm): In future, we may want to reconcile the ids in the stream with
    // those found in the schema
    const auto num_dicts = dictionary_memo_.fields().num_dicts();
    for (int i = 0; i < num_dicts; ++i) {
      ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());
      if (!message) {
        if (i == 0) {
          /// ARROW-6006: If we fail to find any dictionaries in the stream, then
          /// it may be that the stream has a schema but no actual data. In such
          /// case we communicate that we were unable to find the dictionaries
          /// (but there was no failure otherwise), so the caller can decide what
          /// to do
          empty_stream_ = true;
          break;
        } else {
          // ARROW-6126, the stream terminated before receiving the expected
          // number of dictionaries
          return Status::Invalid("IPC stream ended without reading the expected number (",
                                 num_dicts, ") of dictionaries");
        }
      }

      if (message->type() != MessageType::DICTIONARY_BATCH) {
        return Status::Invalid("IPC stream did not have the expected number (", num_dicts,
                               ") of dictionaries at the start of the stream");
      }
      RETURN_NOT_OK(ReadDictionary(*message));
    }

    have_read_initial_dictionaries_ = true;
    return Status::OK();
  }

  // Source of IPC messages; set in Open()
  std::unique_ptr<MessageReader> message_reader_;
  // Copy of the options passed to Open()
  IpcReadOptions options_;
  // Filled by UnpackSchemaMessage() and forwarded to ReadRecordBatchInternal()
  std::vector<bool> field_inclusion_mask_;

  // Set once ReadInitialDictionaries() has completed successfully
  bool have_read_initial_dictionaries_ = false;

  // Flag to set in case where we fail to observe all dictionaries in a stream,
  // and so the reader should not attempt to parse any messages
  bool empty_stream_ = false;

  // Counters updated by ReadNextMessage() and ReadDictionary()
  ReadStats stats_;

  DictionaryMemo dictionary_memo_;
  // schema_ is passed to ReadRecordBatchInternal(); out_schema_ is what schema()
  // returns. Both are filled by UnpackSchemaMessage().
  std::shared_ptr<Schema> schema_, out_schema_;

  // Filled by UnpackSchemaMessage() in Open(). Initialized like the other flags so
  // it never holds an indeterminate value if Open() fails early or is not called.
  bool swap_endian_ = false;
};
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
IPC format 是指进程间通信(Inter-Process Communication)所使用的数据格式。在引用[1]的上下文中,IPC format 可能是指一种用于数据交换的格式;而在引用[2]和[3]的上下文中,IPC format 可能是指名为 ARROW_PRE_0_15_IPC_FORMAT 的环境变量或配置项。这类配置项用于在 Spark 应用程序中设置 ARROW_PRE_0_15_IPC_FORMAT 参数,以控制进程间通信数据的格式。具体而言,可以根据运行环境选择不同的配置方式:如果是在本地运行,可以通过修改 CMake 文件或源代码中的配置项,将 ARROW_PRE_0_15_IPC_FORMAT 设置为 1;如果是在 yarn 集群上运行,可以直接添加相应的配置项,将 ARROW_PRE_0_15_IPC_FORMAT 设置为 1。[1][2][3]

#### 引用

- [1] [IPC:增量电势接触(IPC)用于非线性弹性动力学的稳健而精确的时间步长。 IPC保证了无相交和无反转的轨迹,...](https://download.csdn.net/download/weixin_42115003/15960749)
- [2][3] [pandas_udf频繁报错解决方案](https://blog.csdn.net/weixin_45736572/article/details/122812241)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值