先贴原代码,ANTLRInputStream.h:
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#pragma once
#include <string_view>
#include "CharStream.h"
namespace antlr4 {
// Vacuum all input from a stream and then treat it
// like a string. Can also pass in a string or char[] to use.
// Input is expected to be encoded in UTF-8 and converted to UTF-32 internally.
//
// The whole input is decoded up front into an in-memory buffer, so mark() and
// release() have nothing to do and seeking is plain index arithmetic.
class ANTLR4CPP_PUBLIC ANTLRInputStream : public CharStream {
protected:
/// The data being scanned, stored as UTF-32 code points.
std::u32string _data;
/// Index (0..n) into _data of the next character to be read; n means the end
/// of the input has been reached.
size_t p;
public:
/// Name or source of this char stream. When empty, getSourceName() reports
/// IntStream::UNKNOWN_SOURCE_NAME instead.
std::string name;
/// Creates a stream with no data and the read position at 0.
ANTLRInputStream();
/// Creates a stream from UTF-8 text. Decoding is strict: the underlying load()
/// throws IllegalArgumentException on an illegal byte sequence.
ANTLRInputStream(std::string_view input);
/// Creates a stream from `length` bytes of UTF-8 text starting at `data`
/// (strict decoding, same failure mode as above).
ANTLRInputStream(const char *data, size_t length);
/// Creates a stream from everything that can still be read from `stream`
/// (strict decoding). A stream that is not in a good state is ignored.
ANTLRInputStream(std::istream &stream);
/// Replaces the buffered data with the decoded content of `input` and moves
/// the read position back to 0. A leading UTF-8 BOM is skipped.
/// With `lenient` set, Utf8::lenientDecode is used; otherwise decoding is
/// strict and throws IllegalArgumentException on an illegal byte sequence.
virtual void load(const std::string &input, bool lenient);
/// Same as above for a raw (data, length) byte range.
virtual void load(const char *data, size_t length, bool lenient);
/// Reads the remainder of `stream` and loads it as above. Returns without
/// touching the current data if the stream is not in a good state.
virtual void load(std::istream &stream, bool lenient);
/// Strict-decoding convenience overloads (lenient = false).
virtual void load(const std::string &input) { load(input, false); }
virtual void load(const char *data, size_t length) { load(data, length, false); }
virtual void load(std::istream &stream) { load(stream, false); }
/// Reset the stream so that it's in the same state it was
/// when the object was created *except* the data array is not
/// touched.
virtual void reset();
/// Advances the read position by one character.
/// Throws IllegalStateException when already at the end of the input.
virtual void consume() override;
/// Looks at the character `i` positions away from the read position without
/// consuming anything: LA(1) is the next character, LA(-1) the previous one.
/// Returns 0 for i == 0 and IntStream::EOF when the position is outside the
/// buffered data.
virtual size_t LA(ssize_t i) override;
/// Forwards to LA(i).
virtual size_t LT(ssize_t i);
/// Return the current input symbol index 0..n where n indicates the
/// last symbol has been read. The index is the index of char to
/// be returned from LA(1).
virtual size_t index() override;
/// Number of characters (UTF-32 code points) held in the buffer.
virtual size_t size() override;
/// mark/release do nothing; we have entire buffer. mark() always returns -1.
virtual ssize_t mark() override;
virtual void release(ssize_t marker) override;
/// Moves the read position to `index`. Seeking forward advances by calling
/// consume() repeatedly and stops at size(); seeking backwards (or to the
/// current position) just assigns the position.
virtual void seek(size_t index) override;
/// Returns the characters in the inclusive range [interval.a, interval.b] as
/// UTF-8. Yields an empty string when either bound is negative or the start
/// lies beyond the data; a stop past the end is clamped to the last character.
/// Throws IllegalArgumentException when the data cannot be encoded as UTF-8.
virtual std::string getText(const misc::Interval &interval) override;
/// Returns `name`, or IntStream::UNKNOWN_SOURCE_NAME when `name` is empty.
virtual std::string getSourceName() const override;
/// Returns the entire buffered input encoded as UTF-8.
/// Throws IllegalArgumentException when the data cannot be encoded.
virtual std::string toString() const override;
private:
/// Sets the read position to 0; called by the default constructor.
void InitializeInstanceFields();
};
} // namespace antlr4
ANTLRInputStream.cpp:
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
#include <string.h>
#include "Exceptions.h"
#include "misc/Interval.h"
#include "IntStream.h"
#include "support/Utf8.h"
#include "support/CPPUtils.h"
#include "ANTLRInputStream.h"
using namespace antlr4;
using namespace antlrcpp;
using misc::Interval;
/// Creates a stream without any data; the read position starts at 0.
ANTLRInputStream::ANTLRInputStream() {
  this->InitializeInstanceFields();
}
/// Creates a stream from the UTF-8 text viewed by `input`.
/// The bytes are decoded strictly by the two-argument load() overload.
ANTLRInputStream::ANTLRInputStream(std::string_view input) : ANTLRInputStream() {
  const char *const bytes = input.data();
  const size_t byteCount = input.length();
  load(bytes, byteCount);
}
/// Creates a stream from `length` bytes of UTF-8 text starting at `data`.
///
/// Delegates to the default constructor first, like the other loading
/// constructors do, so the read position is initialized before load() runs
/// rather than only as a side effect of load() finishing.
/// The bytes are decoded strictly by the two-argument load() overload.
ANTLRInputStream::ANTLRInputStream(const char *data, size_t length) : ANTLRInputStream() {
  load(data, length);
}
/// Creates a stream from whatever can still be read from `stream`,
/// using the single-argument (strict) load() overload.
ANTLRInputStream::ANTLRInputStream(std::istream &stream) : ANTLRInputStream() {
  this->load(stream);
}
/// Loads the UTF-8 bytes of `input`; forwards to the (data, length, lenient)
/// overload, which does the actual decoding.
void ANTLRInputStream::load(const std::string &input, bool lenient) {
  const char *const bytes = input.data();
  const size_t byteCount = input.size();
  load(bytes, byteCount, lenient);
}
/// Decodes `length` bytes of UTF-8 starting at `data` into the internal
/// UTF-32 buffer and rewinds the read position to 0.
///
/// A leading UTF-8 byte order mark is dropped before decoding.
/// @param lenient  when true, Utf8::lenientDecode is used; when false the
///                 input is decoded strictly.
/// @throws IllegalArgumentException in strict mode if the bytes contain an
///         illegal sequence; the buffer and position are left unchanged then.
void ANTLRInputStream::load(const char *data, size_t length, bool lenient) {
  std::string_view bytes(data, length);

  // Skip the UTF-8 BOM (EF BB BF) when the input starts with it.
  constexpr std::string_view kUtf8Bom("\xef\xbb\xbf", 3);
  if (bytes.substr(0, kUtf8Bom.size()) == kUtf8Bom) {
    bytes.remove_prefix(kUtf8Bom.size());
  }

  if (!lenient) {
    auto decoded = Utf8::strictDecode(bytes);
    if (!decoded.has_value()) {
      throw IllegalArgumentException("UTF-8 string contains an illegal byte sequence");
    }
    _data = std::move(decoded).value();
  } else {
    _data = Utf8::lenientDecode(bytes);
  }

  p = 0;
}
void ANTLRInputStream::load(std::istream &stream, bool lenient) {
if (!stream.good() || stream.eof()) // No fail, bad or EOF.
return;
_data.clear();
std::string s((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
load(s.data(), s.length(), lenient);
}
void ANTLRInputStream::reset() {
p = 0;
}
/// Moves the read position forward by one character.
/// @throws IllegalStateException when the position is already at the end of
///         the buffered data.
void ANTLRInputStream::consume() {
  const bool atEnd = p >= _data.size();
  if (atEnd) {
    assert(LA(1) == IntStream::EOF);
    throw IllegalStateException("cannot consume EOF");
  }

  // Not at the end, so there is a character to step over.
  ++p;
}
/// Returns the character `i` positions away from the read position without
/// consuming anything.
///
/// LA(1) is the character at the read position, LA(2) the one after it;
/// LA(-1) is the character just before the read position.
/// @return 0 for i == 0 (undefined lookahead), IntStream::EOF when the
///         requested position lies before the first or after the last
///         character, otherwise the code point at that position.
size_t ANTLRInputStream::LA(ssize_t i) {
  if (i == 0) {
    return 0; // undefined
  }

  // Positive lookahead is 1-based (LA(1) -> offset 0); negative lookahead
  // addresses already-read characters directly (LA(-1) -> offset -1).
  const ssize_t offset = (i < 0) ? i : i - 1;
  const ssize_t target = static_cast<ssize_t>(p) + offset;
  const ssize_t limit = static_cast<ssize_t>(_data.size());

  if (target < 0 || target >= limit) {
    return IntStream::EOF;
  }
  return _data[static_cast<size_t>(target)];
}
/// Same as LA(i); simply forwards the call.
size_t ANTLRInputStream::LT(ssize_t i) {
  const size_t symbol = LA(i);
  return symbol;
}
size_t ANTLRInputStream::index() {
return p;
}
/// Returns the number of UTF-32 code points held in the buffer.
size_t ANTLRInputStream::size() {
  return _data.length();
}
// The entire input is buffered, so there is nothing to mark or release.
/// Always returns -1.
ssize_t ANTLRInputStream::mark() {
  constexpr ssize_t kNoMarker = -1;
  return kNoMarker;
}
/// Does nothing; the marker is ignored.
void ANTLRInputStream::release(ssize_t /* unused marker */) {}
/// Moves the read position to `index`.
///
/// Seeking forward goes through consume() one character at a time and stops
/// at the end of the data if `index` lies beyond it. Seeking backwards, or to
/// the current position, just assigns the position.
void ANTLRInputStream::seek(size_t index) {
  if (index > p) {
    // Forward: never step past the end of the buffered data.
    const size_t target = std::min(index, _data.size());
    while (p < target) {
      consume();
    }
    return;
  }

  // Backward or no movement: just jump; no other stream state is updated.
  p = index;
}
/// Returns the characters in the inclusive range [interval.a, interval.b],
/// encoded as UTF-8.
///
/// An empty string is returned when either bound is negative, when the start
/// lies beyond the buffered data, or when the interval is empty (stop before
/// start). A stop index past the end is clamped to the last character.
///
/// @throws IllegalArgumentException if the selected code points cannot be
///         encoded as UTF-8.
std::string ANTLRInputStream::getText(const Interval &interval) {
  if (interval.a < 0 || interval.b < 0) {
    return "";
  }

  const size_t start = static_cast<size_t>(interval.a);
  if (start >= _data.size()) {
    // Also covers an empty buffer, so size() - 1 below cannot wrap around.
    return "";
  }

  size_t stop = static_cast<size_t>(interval.b);
  if (stop >= _data.size()) {
    stop = _data.size() - 1;
  }

  // Guard the unsigned subtraction below: with stop < start the count would
  // wrap to a huge value and substr() would return everything from `start` to
  // the end of the buffer instead of nothing.
  if (stop < start) {
    return "";
  }

  const size_t count = stop - start + 1;
  auto maybeUtf8 = Utf8::strictEncode(std::u32string_view(_data).substr(start, count));
  if (!maybeUtf8.has_value()) {
    throw IllegalArgumentException("Input stream contains invalid Unicode code points");
  }
  return std::move(maybeUtf8).value();
}
/// Returns the name given to this stream, falling back to
/// IntStream::UNKNOWN_SOURCE_NAME when no name has been set.
std::string ANTLRInputStream::getSourceName() const {
  const bool hasName = !name.empty();
  if (hasName) {
    return name;
  }
  return IntStream::UNKNOWN_SOURCE_NAME;
}
/// Returns the complete buffered input encoded as UTF-8.
/// @throws IllegalArgumentException if the buffer cannot be encoded.
std::string ANTLRInputStream::toString() const {
  if (auto encoded = Utf8::strictEncode(_data); encoded.has_value()) {
    return std::move(encoded).value();
  }
  throw IllegalArgumentException("Input stream contains invalid Unicode code points");
}
void ANTLRInputStream::InitializeInstanceFields() {
p = 0;
}
这段代码实现了一个用于处理输入流的类,可以将输入数据视为字符串进行操作,并提供了一系列方法来操作输入数据的不同方面。
.h头文件中通过 #include 引入了 CharStream.h 头文件,该文件定义了一个用于 ANTLR 解析器的字符流输入源的抽象类 CharStream,提供了获取字符范围内文本和返回字符串表示形式的接口,具体的实现细节则需要在派生类中实现。
在命名空间antlr4中,定义了一个类ANTLRInputStream。
以下是该类的主要功能和实现细节:
- 构造函数:包括默认构造函数和从字符串、字符数组、输入流等不同输入源加载数据的构造函数。
- load 方法:用于加载输入数据,支持从字符串、字符数组或输入流加载数据,并提供了对输入数据的严格和宽松解码选项。
- reset 方法:重置流的状态,将读取位置置为初始状态。
- consume 方法:消耗当前位置的字符。
- LA 和 LT 方法:用于获取指定相对位置的字符。
- index 方法:返回当前输入符号的索引。
- size 方法:返回输入符号的总数。
- mark 和 release 方法:标记和释放方法,但在此处不执行实际操作。
- seek 方法:将读取位置移动到指定的索引处。
- getText 方法:获取指定区间内的文本。
- getSourceName 方法:返回输入源的名称。
- toString 方法:返回 ANTLRInputStream 对象的字符串表示形式。

此外,代码中还包括一些辅助函数,如 InitializeInstanceFields 函数用于初始化成员变量。
其中一些笔者看不明白的点在查阅资料后解决:
- 关于std::string_view:https://www.cnblogs.com/yangxunwu1992/p/14018837.html
- 关于virtual和override:加了关键字virtual的成员函数称为虚函数,使用虚函数的核心目的是通过基类的指针或引用调用派生类中定义的函数。虚函数在基类中可以有自己的实现;只有声明为纯虚函数(= 0)时,基类才不提供实现,必须由派生类来实现。在派生类中重新定义基类虚函数的行为称为覆盖(override),也叫重写。加了override,明确表示派生类的这个虚函数是重写基类的,如果派生类与基类虚函数的签名不一致,编译器就会报错,因此,为了减少程序运行时的错误,重写的虚函数都建议加上 override。参考:C++虚函数详解-CSDN博客