ANTLR4学习笔记--ANTLRInputStream

左家垅菜腿

已于 2024-04-23 14:16:08 修改

阅读量482

点赞数 5

分类专栏： ANTLR4 文章标签：学习笔记算法

于 2024-04-23 11:23:38 首次发布

本文链接：https://blog.csdn.net/rilegoule_/article/details/138116613

版权

ANTLR4 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

先贴原代码，ANTLRInputStream.h:

/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

#pragma once

#include <string_view>

#include "CharStream.h"

namespace antlr4 {

  // Vacuum all input from a stream and then treat it
  // like a string. Can also pass in a string or char[] to use.
  // Input is expected to be encoded in UTF-8 and converted to UTF-32 internally.
  class ANTLR4CPP_PUBLIC ANTLRInputStream : public CharStream {
  protected:
    /// The data being scanned, held as UTF-32 code points.
    std::u32string _data;

    /// 0..n-1 index into _data of the next char.
    size_t p;

  public:
    /// What is name or source of this char stream?
    /// Returned by getSourceName() when it is not empty.
    std::string name;

    /// Creates an empty stream.
    ANTLRInputStream();

    /// Creates a stream from UTF-8 text; the text is decoded strictly.
    ANTLRInputStream(std::string_view input);

    /// Creates a stream from `length` bytes of UTF-8 at `data`, or from everything
    /// that can be read from `stream`; both are decoded strictly.
    ANTLRInputStream(const char *data, size_t length);
    ANTLRInputStream(std::istream &stream);

    /// Replace the buffered data with the decoded input and rewind to position 0.
    /// With `lenient == false` an illegal UTF-8 byte sequence raises
    /// IllegalArgumentException; with `lenient == true` the lenient decoder is used.
    virtual void load(const std::string &input, bool lenient);
    virtual void load(const char *data, size_t length, bool lenient);
    virtual void load(std::istream &stream, bool lenient);

    /// Convenience overloads: same as above with `lenient == false` (strict decoding).
    virtual void load(const std::string &input) { load(input, false); }
    virtual void load(const char *data, size_t length) { load(data, length, false); }
    virtual void load(std::istream &stream) { load(stream, false); }

    /// Reset the stream so that it's in the same state it was
    /// when the object was created *except* the data array is not
    /// touched.
    virtual void reset();

    /// Advance past the current symbol; throws IllegalStateException at the end of input.
    virtual void consume() override;

    /// Return the symbol at relative offset `i` (1 = next symbol, -1 = previous symbol),
    /// or IntStream::EOF when that position lies outside the data. LT forwards to LA.
    virtual size_t LA(ssize_t i) override;
    virtual size_t LT(ssize_t i);

    /// <summary>
    /// Return the current input symbol index 0..n where n indicates the
    ///  last symbol has been read.  The index is the index of char to
    ///  be returned from LA(1).
    /// </summary>
    virtual size_t index() override;

    /// Number of symbols (UTF-32 code points) in the buffer.
    virtual size_t size() override;

    /// <summary>
    /// mark/release do nothing; we have entire buffer
    /// </summary>
    virtual ssize_t mark() override;
    virtual void release(ssize_t marker) override;

    /// <summary>
    /// consume() ahead until p==index; can't just set p=index as we must
    ///  update line and charPositionInLine. If we seek backwards, just set p
    /// </summary>
    virtual void seek(size_t index) override;

    /// Return the symbols covered by `interval`, re-encoded as UTF-8.
    virtual std::string getText(const misc::Interval &interval) override;

    /// Return `name`, or IntStream::UNKNOWN_SOURCE_NAME when `name` is empty.
    virtual std::string getSourceName() const override;

    /// Return the whole buffer re-encoded as UTF-8.
    virtual std::string toString() const override;

  private:
    /// Sets the read position `p` to 0; called from the default constructor.
    void InitializeInstanceFields();
  };

} // namespace antlr4

ANTLRInputStream.cpp:

/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

#include <string.h>

#include "Exceptions.h"
#include "misc/Interval.h"
#include "IntStream.h"

#include "support/Utf8.h"
#include "support/CPPUtils.h"

#include "ANTLRInputStream.h"

using namespace antlr4;
using namespace antlrcpp;

using misc::Interval;

/// Creates an empty stream. The only work done here is initialising the
/// read position via InitializeInstanceFields().
ANTLRInputStream::ANTLRInputStream() {
  InitializeInstanceFields();
}

/// Creates a stream over the UTF-8 bytes viewed by `input`.
/// Delegates to the default constructor, then hands the bytes to the
/// two-argument load(), which forwards with lenient == false.
ANTLRInputStream::ANTLRInputStream(std::string_view input): ANTLRInputStream() {
  load(input.data(), input.length());
}

/// Creates a stream from `length` bytes of UTF-8 starting at `data`.
///
/// Delegates to the default constructor first, exactly like the string_view and
/// istream constructors, so that the read position `p` is initialised before
/// load() runs rather than only as load()'s final step.
///
/// The two-argument load() forwards with lenient == false, so an illegal UTF-8
/// byte sequence makes this constructor throw IllegalArgumentException.
ANTLRInputStream::ANTLRInputStream(const char *data, size_t length): ANTLRInputStream() {
  load(data, length);
}

/// Creates a stream from whatever can be read from `stream`.
/// Delegates to the default constructor, then to the one-argument load(),
/// which forwards with lenient == false.
ANTLRInputStream::ANTLRInputStream(std::istream &stream): ANTLRInputStream() {
  load(stream);
}

/// Loads the bytes of `input`; a thin adapter that forwards the string's
/// buffer and size to the (data, length, lenient) overload.
void ANTLRInputStream::load(const std::string &input, bool lenient) {
  load(input.data(), input.size(), lenient);
}

/// Replaces the buffered data with the UTF-32 decoding of `length` bytes at
/// `data` and rewinds the read position to 0.
///
/// @param data    start of the UTF-8 input; a leading UTF-8 BOM is skipped.
/// @param length  number of bytes available at `data`.
/// @param lenient true to use the lenient decoder; false to decode strictly.
/// @throws IllegalArgumentException when decoding strictly and the input
///         contains an illegal byte sequence. In that case `_data` and `p`
///         are left as they were.
void ANTLRInputStream::load(const char *data, size_t length, bool lenient) {
  // A UTF-8 byte order mark at the very start is not part of the text.
  static const char kUtf8Bom[] = "\xef\xbb\xbf";
  const bool startsWithBom = length >= 3 && strncmp(data, kUtf8Bom, 3) == 0;
  const std::string_view bytes = startsWithBom ? std::string_view(data + 3, length - 3)
                                               : std::string_view(data, length);

  if (!lenient) {
    auto decoded = Utf8::strictDecode(bytes);
    if (!decoded.has_value()) {
      throw IllegalArgumentException("UTF-8 string contains an illegal byte sequence");
    }
    _data = std::move(decoded).value();
  } else {
    _data = Utf8::lenientDecode(bytes);
  }

  p = 0;
}

void ANTLRInputStream::load(std::istream &stream, bool lenient) {
  if (!stream.good() || stream.eof()) // No fail, bad or EOF.
    return;

  _data.clear();

  std::string s((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
  load(s.data(), s.length(), lenient);
}

/// Rewinds the read position to the first symbol; the buffered data is untouched.
void ANTLRInputStream::reset() {
  p = 0;
}

/// Steps the read position over the current symbol.
///
/// @throws IllegalStateException when the read position is already at (or
///         beyond) the end of the buffered data.
void ANTLRInputStream::consume() {
  const bool atEnd = p >= _data.size();
  if (atEnd) {
    assert(LA(1) == IntStream::EOF);
    throw IllegalStateException("cannot consume EOF");
  }

  // Past the guard there is always at least one symbol left to step over.
  ++p;
}

/// Returns the symbol at relative offset `i` from the read position.
///
/// LA(1) is the symbol at `p`, LA(2) the one after it, and LA(-1) the symbol
/// just before `p`. LA(0) is undefined and yields 0.
///
/// @return the UTF-32 code point at that position, or IntStream::EOF when the
///         position is before the first or after the last buffered symbol.
size_t ANTLRInputStream::LA(ssize_t i) {
  if (i == 0) {
    return 0; // undefined
  }

  // Look-back offsets are shifted by one so that, after the common "- 1"
  // below, LA(-1) lands on _data[p - 1] while LA(1) lands on _data[p].
  const ssize_t shifted = (i < 0) ? i + 1 : i;
  const ssize_t target = static_cast<ssize_t>(p) + shifted - 1;

  // Only a look-back can produce a negative target; both ends report EOF.
  if (target < 0 || target >= static_cast<ssize_t>(_data.size())) {
    return IntStream::EOF;
  }

  return _data[static_cast<size_t>(target)];
}

/// Same as LA(i): forwards the offset unchanged and returns LA's result.
size_t ANTLRInputStream::LT(ssize_t i) {
  return LA(i);
}

/// Returns the current read position, i.e. the index of the symbol LA(1) reads.
size_t ANTLRInputStream::index() {
  return p;
}

/// Returns the number of buffered symbols (elements of the UTF-32 `_data`
/// string), not the number of UTF-8 bytes that were loaded.
size_t ANTLRInputStream::size() {
  return _data.size();
}

// Mark/release do nothing. We have entire buffer.
/// Always returns -1; no state is recorded.
ssize_t ANTLRInputStream::mark() {
  return -1;
}

/// Intentionally empty counterpart of mark(); the marker argument is ignored.
void ANTLRInputStream::release(ssize_t /* marker */) {
}

/// Moves the read position to `index`.
///
/// Seeking backwards (or to the current position) assigns the position
/// directly. Seeking forwards advances through consume() one symbol at a
/// time and stops at the end of the data if `index` lies beyond it.
void ANTLRInputStream::seek(size_t index) {
  if (index > p) {
    // Forward: clamp the destination to the buffer size, then walk there via
    // consume() so that an overriding consume() sees every step.
    const size_t target = std::min(index, _data.size());
    while (p < target) {
      consume();
    }
    return;
  }

  // Backward or no-op: just jump; don't update stream state (line, ...).
  p = index;
}

/// Returns the symbols in the closed range [interval.a, interval.b],
/// re-encoded as UTF-8.
///
/// An empty string is returned when either bound is negative, when the
/// interval starts at or after the end of the data, or when the interval is
/// reversed (a > b). An upper bound past the end of the data is clamped to the
/// last symbol.
///
/// @throws IllegalArgumentException when the selected symbols cannot be
///         encoded as UTF-8.
std::string ANTLRInputStream::getText(const Interval &interval) {
  if (interval.a < 0 || interval.b < 0) {
    return "";
  }

  const size_t start = static_cast<size_t>(interval.a);
  size_t stop = static_cast<size_t>(interval.b);

  // Validate the range before doing any unsigned arithmetic on it. A reversed
  // interval would otherwise make "stop - start + 1" wrap around to a huge
  // count and return the whole tail of the stream, and an empty buffer would
  // make "_data.size() - 1" wrap as well.
  if (start >= _data.size() || start > stop) {
    return "";
  }

  // _data is non-empty here (start < size), so size() - 1 cannot underflow.
  if (stop >= _data.size()) {
    stop = _data.size() - 1;
  }

  const size_t count = stop - start + 1;

  auto maybeUtf8 = Utf8::strictEncode(std::u32string_view(_data).substr(start, count));
  if (!maybeUtf8.has_value()) {
    throw IllegalArgumentException("Input stream contains invalid Unicode code points");
  }
  return std::move(maybeUtf8).value();
}

/// Returns the user-assigned `name` of this stream, falling back to
/// IntStream::UNKNOWN_SOURCE_NAME when no name has been set.
std::string ANTLRInputStream::getSourceName() const {
  if (!name.empty()) {
    return name;
  }
  return IntStream::UNKNOWN_SOURCE_NAME;
}

/// Returns the entire buffered input re-encoded as UTF-8.
///
/// @throws IllegalArgumentException when the buffer cannot be encoded as UTF-8.
std::string ANTLRInputStream::toString() const {
  auto encoded = Utf8::strictEncode(_data);
  if (encoded.has_value()) {
    return std::move(encoded).value();
  }
  throw IllegalArgumentException("Input stream contains invalid Unicode code points");
}

/// Puts the read position at the start of the (possibly still empty) buffer.
/// Called from the default constructor.
void ANTLRInputStream::InitializeInstanceFields() {
  p = 0;
}

这段代码实现了一个用于处理输入流的类，可以将输入数据视为字符串进行操作，并提供了一系列方法来操作输入数据的不同方面。

ANTLRInputStream.h 头文件中通过 #include 引入了 CharStream.h 头文件，该文件定义了一个用于 ANTLR 解析器的字符流输入源的抽象类 CharStream，提供了获取字符范围内文本和返回字符串表示形式的接口，具体的实现细节则需要在派生类中完成。

在命名空间antlr4中，定义了一个类ANTLRInputStream。

以下是该类的主要功能和实现细节：

构造函数：包括默认构造函数和从字符串、字符数组、输入流等不同输入源加载数据的构造函数。
load 方法：用于加载输入数据，支持从字符串、字符数组或输入流加载数据，并提供了对输入数据的严格和宽松解码选项。
reset 方法：重置流的状态，将读取位置置为初始状态。
consume 方法：消耗当前位置的字符。
LA 和 LT 方法：用于获取指定相对位置的字符。
index 方法：返回当前输入符号的索引。
size 方法：返回输入符号的总数。
mark 和 release 方法：标记和释放方法，但在此处不执行实际操作。
seek 方法：将读取位置移动到指定的索引处。
getText 方法：获取指定区间内的文本。
getSourceName 方法：返回输入源的名称。
toString 方法：返回 ANTLRInputStream 对象的字符串表示形式。此外，代码中还包括一些辅助函数，如 InitializeInstanceFields 函数用于初始化成员变量。

其中一些笔者看不明白的点在查阅资料后解决：

关于std::string_view：https://www.cnblogs.com/yangxunwu1992/p/14018837.html
关于virtual和override：加了关键字virtual的成员函数称为虚函数，使用虚函数的核心目的是通过基类的指针或引用调用派生类中定义的函数。虚函数在基类中声明（可以带有默认实现，也可以声明为没有实现的纯虚函数），所以可以在其子类中重新定义它，这种行为称为覆盖（override），也叫重写。加了override，明确表示派生类的这个虚函数是重写基类的，如果派生类与基类虚函数的签名不一致，编译器就会报错，因此，为了减少程序运行时的错误，重写的虚函数都建议加上 override。C++虚函数详解-CSDN博客