Velox Build
make release EXTRA_CMAKE_FLAGS=" -DVELOX_ENABLE_PARQUET=ON -DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_BENCHMARKS=ON"
相关 options
CXXFLAGS="-Wno-error"
add_compile_options(-g -O0 -Wno-error -Wno-sign-compare -Wno-unused-function)
UT 所在目录
_build/release/velox/exec/tests
Velox Functions
velox/velox/functions/Macros.h
// Opens the definition of a simple (scalar) UDF called `Name`.
//
// The macro expands to the opening of `struct udf_##Name` and, nested inside
// it, the opening of `template <typename __Velox_ExecParams> struct udf`.
// Both structs are left open; the function author writes the body right after
// the macro and must close it with VELOX_UDF_END().
//
// Inside the nested `udf` struct the macro provides:
//   * arg_type<T> / out_type<T>: the `in_type` / `out_type` members of
//     `__Velox_ExecParams::resolver<T>`.
//   * opt_arg_type<T> / opt_out_type<T>: the same types wrapped in
//     std::optional.
//   * MapVal, ArrayVal, VarcharVal, VarbinaryVal, RowVal: shorthands for
//     arg_type of the corresponding ::facebook::velox type.
//   * MapWriter, ArrayWriter, VarcharWriter, VarbinaryWriter, RowWriter:
//     shorthands for out_type of the corresponding ::facebook::velox type.
//   * `name`: the stringified macro argument (`#Name`).
#define VELOX_UDF_BEGIN(Name) \
struct udf_##Name { \
template <typename __Velox_ExecParams> \
struct udf { \
template <typename __Velox_TArg> \
using arg_type = typename __Velox_ExecParams::template resolver< \
__Velox_TArg>::in_type; \
\
template <typename __Velox_TArg> \
using out_type = typename __Velox_ExecParams::template resolver< \
__Velox_TArg>::out_type; \
\
template <typename __Velox_TArg> \
using opt_arg_type = \
std::optional<typename __Velox_ExecParams::template resolver< \
__Velox_TArg>::in_type>; \
\
template <typename __Velox_TArg> \
using opt_out_type = \
std::optional<typename __Velox_ExecParams::template resolver< \
__Velox_TArg>::out_type>; \
\
template <typename __Velox_TKey, typename __Velox_TVal> \
using MapVal = \
arg_type<::facebook::velox::Map<__Velox_TKey, __Velox_TVal>>; \
template <typename __Velox_TElement> \
using ArrayVal = arg_type<::facebook::velox::Array<__Velox_TElement>>; \
using VarcharVal = arg_type<::facebook::velox::Varchar>; \
using VarbinaryVal = arg_type<::facebook::velox::Varbinary>; \
template <typename... __Velox_TArgs> \
using RowVal = arg_type<::facebook::velox::Row<__Velox_TArgs...>>; \
template <typename __Velox_TKey, typename __Velox_TVal> \
using MapWriter = \
out_type<::facebook::velox::Map<__Velox_TKey, __Velox_TVal>>; \
template <typename __Velox_TElement> \
using ArrayWriter = \
out_type<::facebook::velox::Array<__Velox_TElement>>; \
using VarcharWriter = out_type<::facebook::velox::Varchar>; \
using VarbinaryWriter = out_type<::facebook::velox::Varbinary>; \
template <typename... __Velox_TArgs> \
using RowWriter = out_type<::facebook::velox::Row<__Velox_TArgs...>>; \
static constexpr auto name = #Name;
// Closes a UDF definition: expands to two `};` sequences, i.e. it closes two
// struct scopes. Presumably these are the nested `udf` template and the outer
// `udf_<Name>` wrapper opened by VELOX_UDF_BEGIN -- confirm against that macro.
#define VELOX_UDF_END() \
} \
; \
} \
;
Functions Registration
velox/velox/functions/lib/RegistrationHelpers.h
// Registers the unary function template `T` under `aliases` for every numeric
// type handled by the two helpers below: first the integral overloads, then
// the floating-point overloads.
template <template <class> class T>
void registerUnaryNumeric(const std::vector<std::string>& aliases) {
registerUnaryIntegral<T>(aliases);
registerUnaryFloatingPoint<T>(aliases);
}
// Registers the unary function template `T` under `aliases` once per signed
// integer width (8, 16, 32 and 64 bits). In every registration the return type
// equals the argument type.
template <template <class> class T>
void registerUnaryIntegral(const std::vector<std::string>& aliases) {
registerFunction<T, int8_t, int8_t>(aliases);
registerFunction<T, int16_t, int16_t>(aliases);
registerFunction<T, int32_t, int32_t>(aliases);
registerFunction<T, int64_t, int64_t>(aliases);
}
// Registers the unary function template `T` under `aliases` for `double` and
// `float`. In both registrations the return type equals the argument type.
template <template <class> class T>
void registerUnaryFloatingPoint(const std::vector<std::string>& aliases) {
registerFunction<T, double, double>(aliases);
registerFunction<T, float, float>(aliases);
}
// Registers a simple function whose implementation is wrapped in `Func`, a
// type exposing a nested `udf` class template. The nested template is
// instantiated with exec::VectorExec and bound to the signature
// TReturn(TArgs...) through core::UDFHolder.
//
// @param aliases Names to register the function under; forwarded unchanged to
//     exec::registerSimpleFunction.
// @param returnType Optional explicit return type; ownership is handed over
//     to exec::registerSimpleFunction.
template <typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = typename Func::template udf<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: the unqualified call only resolved through ADL on
  // std::shared_ptr and is flagged by newer compilers.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}
// New registration function; mostly a copy from the function above, but taking
// the inner "udf" struct directly, instead of the wrapper. We can keep both for
// a while to maintain backwards compatibility, but the idea is to remove the
// one above eventually.
//
// `Func` is instantiated with exec::VectorExec and bound to the signature
// TReturn(TArgs...) through core::UDFHolder.
//
// @param aliases Names to register the function under; forwarded unchanged to
//     exec::registerSimpleFunction.
// @param returnType Optional explicit return type; ownership is handed over
//     to exec::registerSimpleFunction.
template <template <class> typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = Func<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: the unqualified call only resolved through ADL on
  // std::shared_ptr and is flagged by newer compilers.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}
VELOX_REGISTER_VECTOR_FUNCTION
velox/velox/expression/VectorFunction.h
// Registers a vectorized UDF associated with a given tag.
// This should be used in the same namespace the declare macro is used in.
//
// Expands to a braced block that (1) forward-declares, as `extern`, the
// function named by _VELOX_REGISTER_FUNC_NAME(tag) taking a
// `const std::string&`, and (2) calls it with `name`. Because the expansion is
// a compound statement, the macro can only be used inside a function body.
#define VELOX_REGISTER_VECTOR_FUNCTION(tag, name) \
{ \
extern void _VELOX_REGISTER_FUNC_NAME(tag)(const std::string&); \
_VELOX_REGISTER_FUNC_NAME(tag)(name); \
}
Register
./velox/functions/Registerer.h
// Registers a simple function whose implementation is wrapped in `Func`, a
// type exposing a nested `udf` class template. The nested template is
// instantiated with exec::VectorExec and bound to the signature
// TReturn(TArgs...) through core::UDFHolder.
//
// @param aliases Names to register the function under; forwarded unchanged to
//     exec::registerSimpleFunction.
// @param returnType Optional explicit return type; ownership is handed over
//     to exec::registerSimpleFunction.
template <typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = typename Func::template udf<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: the unqualified call only resolved through ADL on
  // std::shared_ptr and is flagged by newer compilers.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}
// New registration function; mostly a copy from the function above, but taking
// the inner "udf" struct directly, instead of the wrapper. We can keep both for
// a while to maintain backwards compatibility, but the idea is to remove the
// one above eventually.
//
// `Func` is instantiated with exec::VectorExec and bound to the signature
// TReturn(TArgs...) through core::UDFHolder.
//
// @param aliases Names to register the function under; forwarded unchanged to
//     exec::registerSimpleFunction.
// @param returnType Optional explicit return type; ownership is handed over
//     to exec::registerSimpleFunction.
template <template <class> typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = Func<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: the unqualified call only resolved through ADL on
  // std::shared_ptr and is flagged by newer compilers.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}
./velox/expression/SimpleFunctionRegistry.h
// This function should be called once and alone.
template <typename UDFHolder>
void registerSimpleFunction(
const std::vector<std::string>& names,
std::shared_ptr<const Type> returnType) {
SimpleFunctions()
.registerFunction<SimpleFunctionAdapterFactoryImpl<UDFHolder>>(
names, returnType);
}
./velox/expression/FunctionRegistry.h
public:
// Registers `UDF` in this registry.
//
// The metadata object is obtained from GetSingletonUdfMetadata (given
// `returnType`). When `aliases` is empty the function is registered under the
// single name reported by the metadata; otherwise it is registered under
// every alias. The stored factory creates the UDF from the metadata's return
// type.
template <typename UDF>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  auto metadata =
      GetSingletonUdfMetadata<typename UDF::Metadata>(std::move(returnType));

  std::vector<std::string> registrationNames;
  if (aliases.empty()) {
    registrationNames.push_back(metadata->getName());
  } else {
    registrationNames = aliases;
  }

  registerFunctionInternal(registrationNames, metadata, [metadata]() {
    return CreateUdf<UDF>(metadata->returnType());
  });
}
....
// Stores a FunctionEntry built from (`metadata`, `factory`) under every name
// in `names`, keyed by the metadata's signature. An existing entry for the
// same sanitized name and signature is overwritten.
//
// @param names Names to register; each one is passed through
//     sanitizeFunctionName() before being used as a key.
// @param metadata Metadata shared by all created entries.
// @param factory Factory stored in each entry. It has to be copied for every
//     name, so FunctionFactory must be copyable.
void registerFunctionInternal(
    const std::vector<std::string>& names,
    const std::shared_ptr<const Metadata>& metadata,
    typename FunctionEntry<Function, Metadata>::FunctionFactory factory) {
  for (const auto& name : names) {
    auto sanitizedName = sanitizeFunctionName(name);
    // operator[] creates an empty SignatureMap on first use, so no separate
    // find()/insert step is needed.
    auto& signatures = registeredFunctions_[sanitizedName];
    // Pass `factory` by copy: moving it inside the loop could hand a
    // moved-from (empty) factory to every alias after the first one.
    signatures[*metadata->signature()] =
        std::make_unique<const FunctionEntry<Function, Metadata>>(
            metadata, factory);
  }
}
Get
./velox/functions/FunctionRegistry.cpp
// Builds a FunctionSignatureMap by letting the simple-function populator and
// then the vector-function populator fill the same map.
FunctionSignatureMap getFunctionSignatures() {
  FunctionSignatureMap allSignatures;
  populateSimpleFunctionSignatures(allSignatures);
  populateVectorFunctionSignatures(allSignatures);
  return allSignatures;
}
...
// For every function name known to the simple-function registry, stores that
// function's signatures in `map` under the same name, replacing any value
// already present for it.
void populateSimpleFunctionSignatures(FunctionSignatureMap& map) {
  auto& registry = exec::SimpleFunctions();
  const auto registeredNames = registry.getFunctionNames();
  for (const auto& registeredName : registeredNames) {
    map[registeredName] = registry.getFunctionSignatures(registeredName);
  }
}
./velox/expression/FunctionRegistry.h
std::vector<const FunctionSignature*> getFunctionSignatures(
const std::string& name) {
std::vector<const FunctionSignature*> signatures;
if (auto signatureMap = getSignatureMap(name)) {
signatures.reserve(signatureMap->size());
for (const auto& pair : *signatureMap) {
signatures.emplace_back(&pair.first);
}
}
return signatures;
}
// Looks up the signature map stored under the sanitized form of `name`.
// Returns a pointer into registeredFunctions_, or nullptr when the name is
// not registered.
SignatureMap* getSignatureMap(const std::string& name) {
  auto entry = registeredFunctions_.find(sanitizeFunctionName(name));
  if (entry == registeredFunctions_.end()) {
    return nullptr;
  }
  return &entry->second;
}
Velox evaluateOnce trace
FunctionBaseTest.h:
evaluateOnce
template <typename ReturnType, typename... Args>
std::optional<ReturnType> evaluateOnce(
const std::string& expr,
const RowVectorPtr rowVectorPtr) {
auto result =
evaluate<SimpleVector<EvalType<ReturnType>>>(expr, rowVectorPtr);
return result->isNullAt(0) ? std::optional<ReturnType>{}
: ReturnType(result->valueAt(0));
}
evaluate
// Evaluates `expression` against `data` and hands the resulting vector to
// castEvaluateResult<T>, together with the expression text, to obtain a
// std::shared_ptr<T>.
template <typename T>
std::shared_ptr<T> evaluate(
    const std::string& expression,
    const RowVectorPtr& data) {
  auto untypedResult = evaluate(expression, data);
  return castEvaluateResult<T>(untypedResult, expression);
}
// Use this directly if you don't want it to cast the returned vector.
//
// Turns the expression text into a typed expression, using the type of `data`
// (cast to RowType) as the input row type, and evaluates it on `data`.
VectorPtr evaluate(const std::string& expression, const RowVectorPtr& data) {
  auto inputRowType = std::dynamic_pointer_cast<const RowType>(data->type());
  auto typed = makeTypedExpr(expression, inputRowType);
  return evaluate(typed, data);
}
// Parses `text` with the fixture's parse options and runs type inference on
// the parsed expression against `rowType`, using the execution context's
// memory pool.
core::TypedExprPtr makeTypedExpr(
    const std::string& text,
    const RowTypePtr& rowType) {
  auto parsedExpr = parse::parseExpr(text, options_);
  return core::Expressions::inferTypes(parsedExpr, rowType, execCtx_.pool());
}
Expressions.cpp
// static
// Convenience overload: forwards to the five-argument inferTypes(), passing an
// empty `{}` for the extra third argument.
// NOTE(review): the meaning of that third argument is not visible here --
// confirm against the five-argument overload.
TypedExprPtr Expressions::inferTypes(
const std::shared_ptr<const core::IExpr>& expr,
const TypePtr& inputRow,
memory::MemoryPool* pool,
const VectorPtr& complexConstants) {
return inferTypes(expr, inputRow, {}, pool, complexConstants);
}
// Builds the typed expression for the call `expr` applied to `inputs`.
//
// If adjustLastNArguments() (asked to consider all of the inputs) yields an
// expression, that expression is returned. Otherwise the return type is
// resolved with nullOnFailure == false and a CallTypedExpr with that type,
// a copy of `inputs` and the call's function name is returned.
TypedExprPtr createWithImplicitCast(
    const std::shared_ptr<const core::CallExpr>& expr,
    const std::vector<TypedExprPtr>& inputs) {
  if (auto adjusted = adjustLastNArguments(inputs, expr, inputs.size())) {
    return adjusted;
  }
  auto type = resolveTypeImpl(inputs, expr, false /*nullOnFailure*/);
  // `inputs` is a const reference, so it can only be copied; the former
  // std::move(inputs) was a no-op that merely suggested a move.
  return std::make_shared<CallTypedExpr>(
      type, inputs, std::string{expr->getFunctionName()});
}
// Determine output type based on input types.
//
// Delegates to the resolver hook installed on Expressions; the hook must be
// set, which is checked before it is invoked with the same three arguments.
TypePtr resolveTypeImpl(
    std::vector<TypedExprPtr> inputs,
    const std::shared_ptr<const CallExpr>& expr,
    bool nullOnFailure) {
  VELOX_CHECK_NOT_NULL(Expressions::getResolverHook());
  const auto& resolverHook = Expressions::getResolverHook();
  return resolverHook(inputs, expr, nullOnFailure);
}
/usr/include/c++/9/bits/std_function.h:688
// libstdc++ implementation of std::function's call operator (shown here as
// the frame through which the resolver hook is invoked in the trace above).
// Throws via __throw_bad_function_call() when the wrapper holds no target;
// otherwise forwards the arguments to the stored invoker.
template<typename _Res, typename... _ArgTypes>
_Res
function<_Res(_ArgTypes...)>::
operator()(_ArgTypes... __args) const
{
if (_M_empty())
__throw_bad_function_call();
return _M_invoker(_M_functor, std::forward<_ArgTypes>(__args)...);
}
ArrayVector, MapVector and RowVector
Flat vectors of complex types ARRAY, MAP and ROW / STRUCT are represented using ArrayVector, MapVector and RowVector.
RowVector
Finally, RowVector stores values of type ROW (e.g. structs). In addition to the nulls buffer, it contains a list of child vectors.
std::vector<VectorPtr> children_;
Here is an example.
RowVector is used to represent a single column of type struct as well as a collection of columns that are being passed from one operator to the next during query execution.
// Let's create two vectors of 64-bit integers and one vector of strings.
// Each vector has seven entries.
auto a = makeFlatVector<int64_t>({0, 1, 2, 3, 4, 5, 6});
auto b = makeFlatVector<int64_t>({0, 5, 10, 15, 20, 25, 30});
auto dow = makeFlatVector<std::string>(
{"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday"});
// Combine the three vectors into one RowVector; the first list gives the
// child (column) names, the second the child vectors in the same order.
auto data = makeRowVector({"a", "b", "dow"}, {a, b, dow});
// Print the no-argument toString() of the RowVector.
std::cout << std::endl
<< "> vectors a, b, dow: " << data->toString() << std::endl;
// Print toString() for the range starting at 0 and ending at data->size().
std::cout << data->toString(0, data->size()) << std::endl;
文件读写
./velox/common/file/File.h
// A read-only file. All methods in this object should be thread safe.
//
// Subclasses must implement the buffer-based pread(), shouldCoalesce(),
// size(), memoryUsage(), getName() and getNaturalReadSize(). The remaining
// virtual methods have default implementations.
class ReadFile {
 public:
  virtual ~ReadFile() = default;

  // Reads the data at [offset, offset + length) into the provided pre-allocated
  // buffer 'buf'. The bytes are returned as a string_view pointing to 'buf'.
  //
  // This method should be thread safe.
  virtual std::string_view
  pread(uint64_t offset, uint64_t length, void* FOLLY_NONNULL buf) const = 0;

  // Same as above, but returns owned data directly.
  //
  // This method should be thread safe.
  virtual std::string pread(uint64_t offset, uint64_t length) const;

  // Reads starting at 'offset' into the memory referenced by the
  // Ranges in 'buffers'. The buffers are filled left to right. A
  // buffer with nullptr data will cause its size worth of bytes to be skipped.
  //
  // This method should be thread safe.
  virtual uint64_t preadv(
      uint64_t /*offset*/,
      const std::vector<folly::Range<char*>>& /*buffers*/) const;

  // Vectorized read API. Implementations can coalesce and parallelize.
  // The offsets don't need to be sorted.
  // `iobufs` is a range of IOBufs to store the read data. They
  // will be stored in the same order as the input `regions` vector. So the
  // array must be pre-allocated by the caller, with the same size as `regions`,
  // but don't need to be initialized, since each iobuf will be copy-constructed
  // by the preadv.
  //
  // This method should be thread safe.
  virtual void preadv(
      folly::Range<const common::Region*> regions,
      folly::Range<folly::IOBuf*> iobufs) const;

  // Like preadv but may execute asynchronously and returns the read
  // size or exception via SemiFuture. Use hasPreadvAsync() to check
  // if the implementation is in fact asynchronous.
  //
  // The default implementation runs preadv() synchronously. An exception
  // derived from std::exception is captured into the returned SemiFuture.
  //
  // This method should be thread safe.
  virtual folly::SemiFuture<uint64_t> preadvAsync(
      uint64_t offset,
      const std::vector<folly::Range<char*>>& buffers) const {
    try {
      return folly::SemiFuture<uint64_t>(preadv(offset, buffers));
    } catch (const std::exception&) {
      // Capture the in-flight exception itself. Passing the caught
      // `const std::exception&` to makeSemiFuture would copy it by its static
      // type and slice off the concrete exception type and its message.
      return folly::makeSemiFuture<uint64_t>(
          folly::exception_wrapper(std::current_exception()));
    }
  }

  // Returns true if preadvAsync has a native implementation that is
  // asynchronous. The default implementation is synchronous.
  virtual bool hasPreadvAsync() const {
    return false;
  }

  // Whether preads should be coalesced where possible. E.g. remote disk would
  // set to true, in-memory to false.
  virtual bool shouldCoalesce() const = 0;

  // Number of bytes in the file.
  virtual uint64_t size() const = 0;

  // An estimate for the total amount of memory *this uses.
  virtual uint64_t memoryUsage() const = 0;

  // The total number of bytes *this had been used to read since creation or
  // the last resetBytesRead. We sum all the |length| variables passed to
  // preads, not the actual amount of bytes read (which might be less).
  virtual uint64_t bytesRead() const {
    return bytesRead_;
  }

  // Resets the counter reported by bytesRead() to zero.
  virtual void resetBytesRead() {
    bytesRead_ = 0;
  }

  // Returns the name of this file.
  virtual std::string getName() const = 0;

  //
  // Get the natural size for reads.
  // @return the number of bytes that should be read at once
  //
  virtual uint64_t getNaturalReadSize() const = 0;

 protected:
  // Mutable and atomic so that const read methods can update it.
  mutable std::atomic<uint64_t> bytesRead_ = 0;
};
./velox/common/file/FileSystems.h
/// An abstract FileSystem. Concrete file systems implement every pure virtual
/// method below; the configuration passed at construction is kept in
/// 'config_' for use by subclasses.
class FileSystem {
public:
/// Stores 'config' for later use by the concrete file system.
FileSystem(std::shared_ptr<const Config> config)
: config_(std::move(config)) {}
virtual ~FileSystem() = default;
/// Returns the name of the File System
virtual std::string name() const = 0;
/// Returns a ReadFile handle for a given file path
virtual std::unique_ptr<ReadFile> openFileForRead(
std::string_view path,
const FileOptions& options = {}) = 0;
/// Returns a WriteFile handle for a given file path
virtual std::unique_ptr<WriteFile> openFileForWrite(
std::string_view path,
const FileOptions& options = {}) = 0;
/// Deletes the file at 'path'. Throws on error.
virtual void remove(std::string_view path) = 0;
/// Renames the file at 'oldPath' to 'newPath'. If 'overwrite' is true, the
/// rename overwrites a file that already exists at 'newPath'.
/// Throws a velox user exception on error.
virtual void rename(
std::string_view oldPath,
std::string_view newPath,
bool overwrite = false) = 0;
/// Returns true if the file exists.
virtual bool exists(std::string_view path) = 0;
/// Returns the list of files or folders in a path. Currently, this method
/// will be used for testing, but we will need to change this to an iterator
/// output method to avoid potential heavy output if there are many entries in
/// the folder.
virtual std::vector<std::string> list(std::string_view path) = 0;
/// Create a directory (recursively). Throws velox exception on failure.
virtual void mkdir(std::string_view path) = 0;
/// Remove a directory (all the files and sub-directories underneath
/// recursively). Throws velox exception on failure.
virtual void rmdir(std::string_view path) = 0;
protected:
/// Configuration supplied at construction time.
std::shared_ptr<const Config> config_;
};
S3ReadFile
// ReadFile implementation backed by a single S3 object. The object is
// addressed by the bucket and key parsed from the path given at construction,
// and every read is served by one ranged GetObject request.
//
// TODO: Implement retry on failure.
class S3ReadFile final : public ReadFile {
 public:
  // Splits 'path' into bucket and key. 'client' is stored as a raw pointer
  // and is not owned by this object.
  S3ReadFile(const std::string& path, Aws::S3::S3Client* client)
      : client_(client) {
    getBucketAndKeyFromS3Path(path, bucket_, key_);
  }

  // Gets the length of the file.
  // Checks if there are any issues reading the file.
  void initialize() {
    // Make it a no-op if invoked twice.
    if (length_ != -1) {
      return;
    }

    Aws::S3::Model::HeadObjectRequest request;
    request.SetBucket(awsString(bucket_));
    request.SetKey(awsString(key_));

    auto outcome = client_->HeadObject(request);
    VELOX_CHECK_AWS_OUTCOME(
        outcome, "Failed to get metadata for S3 object", bucket_, key_);
    length_ = outcome.GetResult().GetContentLength();
    VELOX_CHECK_GE(length_, 0);
  }

  // Reads 'length' bytes starting at 'offset' into 'buffer' and returns a view
  // over those bytes.
  std::string_view pread(uint64_t offset, uint64_t length, void* buffer)
      const override {
    preadInternal(offset, length, static_cast<char*>(buffer));
    return {static_cast<char*>(buffer), length};
  }

  // Reads 'length' bytes starting at 'offset' into a newly allocated string.
  std::string pread(uint64_t offset, uint64_t length) const override {
    std::string result(length, 0);
    char* position = result.data();
    preadInternal(offset, length, position);
    return result;
  }

  // Fills 'buffers' left to right with the bytes starting at 'offset' and
  // returns the total size of all ranges, gaps included.
  uint64_t preadv(
      uint64_t offset,
      const std::vector<folly::Range<char*>>& buffers) const override {
    // 'buffers' contains Ranges(data, size) with some gaps (data = nullptr) in
    // between. This call must populate the ranges (except gap ranges)
    // sequentially starting from 'offset'. AWS S3 GetObject does not support
    // multi-range. AWS S3 also charges by number of read requests and not size.
    // The idea here is to use a single read spanning all the ranges and then
    // populate individual ranges. We pre-allocate a buffer to support this.
    size_t length = 0;
    for (const auto& range : buffers) {
      length += range.size();
    }
    // TODO: allocate from a memory pool
    std::string result(length, 0);
    preadInternal(offset, length, static_cast<char*>(result.data()));
    size_t resultOffset = 0;
    for (const auto& range : buffers) {
      // Gap ranges have no destination; their bytes are skipped.
      if (range.data()) {
        memcpy(range.data(), &(result.data()[resultOffset]), range.size());
      }
      resultOffset += range.size();
    }
    return length;
  }

  // Returns the object length obtained by initialize().
  // NOTE(review): before initialize() runs, length_ is -1 and this returns
  // that value converted to uint64_t -- confirm callers always initialize.
  uint64_t size() const override {
    return length_;
  }

  uint64_t memoryUsage() const override {
    // TODO: Check if any buffers are being used by the S3 library
    return sizeof(Aws::S3::S3Client) + kS3MaxKeySize + 2 * sizeof(std::string) +
        sizeof(int64_t);
  }

  bool shouldCoalesce() const final {
    return false;
  }

  std::string getName() const final {
    return fmt::format("s3://{}/{}", bucket_, key_);
  }

  // 72 MiB.
  uint64_t getNaturalReadSize() const final {
    return 72 << 20;
  }

 private:
  // Issues one ranged GetObject for [offset, offset + length) and streams the
  // response into 'position'.
  //
  // The assumption here is that "position" has space for at least "length"
  // bytes.
  void preadInternal(uint64_t offset, uint64_t length, char* position) const {
    // Nothing to read. Without this guard 'offset + length - 1' would name a
    // byte before 'offset' (and wrap around when offset is 0), producing an
    // invalid Range header for a request that needs no data at all.
    if (length == 0) {
      return;
    }

    // Read the desired range of bytes.
    Aws::S3::Model::GetObjectRequest request;
    Aws::S3::Model::GetObjectResult result;

    request.SetBucket(awsString(bucket_));
    request.SetKey(awsString(key_));
    // HTTP byte ranges are inclusive on both ends, hence the '- 1'.
    std::stringstream ss;
    ss << "bytes=" << offset << "-" << offset + length - 1;
    request.SetRange(awsString(ss.str()));
    request.SetResponseStreamFactory(
        AwsWriteableStreamFactory(position, length));
    auto outcome = client_->GetObject(request);
    VELOX_CHECK_AWS_OUTCOME(outcome, "Failed to get S3 object", bucket_, key_);
  }

  Aws::S3::S3Client* client_;
  std::string bucket_;
  std::string key_;
  // Object length in bytes; -1 until initialize() has run.
  int64_t length_ = -1;
};