Velox

Velox Build

make release EXTRA_CMAKE_FLAGS=" -DVELOX_ENABLE_PARQUET=ON -DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_BENCHMARKS=ON"

相关编译选项(options)

CXXFLAGS="-Wno-error" 


add_compile_options(-g -O0 -Wno-error -Wno-sign-compare -Wno-unused-function)

单元测试(UT)所在目录:
_build/release/velox/exec/tests
Velox Functions

velox/velox/functions/Macros.h

// Opens the definition of a simple UDF called `Name`.
//
// The expansion starts `struct udf_<Name>` and, nested inside it, a template
// `struct udf` parameterized on `__Velox_ExecParams`. Within that inner struct
// it declares:
//   - arg_type<T> / out_type<T>: the `in_type` / `out_type` selected by
//     `__Velox_ExecParams::resolver<T>`;
//   - opt_arg_type<T> / opt_out_type<T>: the same types wrapped in
//     std::optional;
//   - MapVal, ArrayVal, VarcharVal, VarbinaryVal, RowVal: arg_type shorthands
//     for the corresponding ::facebook::velox types;
//   - MapWriter, ArrayWriter, VarcharWriter, VarbinaryWriter, RowWriter: the
//     matching out_type shorthands;
//   - `name`: a static constexpr holding the stringified `Name`.
//
// Both structs are left open so the function body can follow the macro; they
// are closed by VELOX_UDF_END().
#define VELOX_UDF_BEGIN(Name)                                                \
  struct udf_##Name {                                                        \
    template <typename __Velox_ExecParams>                                   \
    struct udf {                                                             \
      template <typename __Velox_TArg>                                       \
      using arg_type = typename __Velox_ExecParams::template resolver<       \
          __Velox_TArg>::in_type;                                            \
                                                                             \
      template <typename __Velox_TArg>                                       \
      using out_type = typename __Velox_ExecParams::template resolver<       \
          __Velox_TArg>::out_type;                                           \
                                                                             \
      template <typename __Velox_TArg>                                       \
      using opt_arg_type =                                                   \
          std::optional<typename __Velox_ExecParams::template resolver<      \
              __Velox_TArg>::in_type>;                                       \
                                                                             \
      template <typename __Velox_TArg>                                       \
      using opt_out_type =                                                   \
          std::optional<typename __Velox_ExecParams::template resolver<      \
              __Velox_TArg>::out_type>;                                      \
                                                                             \
      template <typename __Velox_TKey, typename __Velox_TVal>                \
      using MapVal =                                                         \
          arg_type<::facebook::velox::Map<__Velox_TKey, __Velox_TVal>>;      \
      template <typename __Velox_TElement>                                   \
      using ArrayVal = arg_type<::facebook::velox::Array<__Velox_TElement>>; \
      using VarcharVal = arg_type<::facebook::velox::Varchar>;               \
      using VarbinaryVal = arg_type<::facebook::velox::Varbinary>;           \
      template <typename... __Velox_TArgs>                                   \
      using RowVal = arg_type<::facebook::velox::Row<__Velox_TArgs...>>;     \
      template <typename __Velox_TKey, typename __Velox_TVal>                \
      using MapWriter =                                                      \
          out_type<::facebook::velox::Map<__Velox_TKey, __Velox_TVal>>;      \
      template <typename __Velox_TElement>                                   \
      using ArrayWriter =                                                    \
          out_type<::facebook::velox::Array<__Velox_TElement>>;              \
      using VarcharWriter = out_type<::facebook::velox::Varchar>;            \
      using VarbinaryWriter = out_type<::facebook::velox::Varbinary>;        \
      template <typename... __Velox_TArgs>                                   \
      using RowWriter = out_type<::facebook::velox::Row<__Velox_TArgs...>>;  \
      static constexpr auto name = #Name;

// Closes the two nested structs opened by VELOX_UDF_BEGIN: first the inner
// `udf` template struct, then the outer `udf_<Name>` wrapper. Each closing
// brace is followed by the semicolon that ends the struct definition.
#define VELOX_UDF_END() \
  }                     \
  ;                     \
  }                     \
  ;

Functions Registration

velox/velox/functions/lib/RegistrationHelpers.h

// Registers the unary function template `Func` for every numeric signature
// covered by the integral and floating-point helpers, under each of the given
// `aliases`. Integral signatures are registered first.
template <template <class> typename Func>
void registerUnaryNumeric(const std::vector<std::string>& aliases) {
  registerUnaryIntegral<Func>(aliases);
  registerUnaryFloatingPoint<Func>(aliases);
}
// Registers the unary function template `Func` once per signed integer width
// (8, 16, 32 and 64 bits). In every signature the return type equals the
// argument type.
template <template <class> typename Func>
void registerUnaryIntegral(const std::vector<std::string>& aliases) {
  registerFunction<Func, int8_t, int8_t>(aliases);
  registerFunction<Func, int16_t, int16_t>(aliases);
  registerFunction<Func, int32_t, int32_t>(aliases);
  registerFunction<Func, int64_t, int64_t>(aliases);
}
// Registers the unary function template `Func` for the two floating-point
// signatures, double -> double first and then float -> float.
template <template <class> typename Func>
void registerUnaryFloatingPoint(const std::vector<std::string>& aliases) {
  registerFunction<Func, double, double>(aliases);
  registerFunction<Func, float, float>(aliases);
}
// Registers a simple function whose implementation is the nested `udf`
// template of the wrapper struct `Func` (the shape produced by
// VELOX_UDF_BEGIN / VELOX_UDF_END).
//
// @tparam Func     Wrapper struct exposing `template <...> struct udf`.
// @tparam TReturn  Return type of the function signature.
// @tparam TArgs    Argument types of the function signature.
// @param aliases     Names to register the function under.
// @param returnType  Optional explicit return type; ownership is handed on to
//                    exec::registerSimpleFunction.
template <typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = typename Func::template udf<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: an unqualified `move` only resolves through ADL and
  // could silently bind to an unrelated overload.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}

// New registration function; mostly a copy of the wrapper-based overload, but
// taking the inner "udf" struct template directly instead of the wrapper. Both
// are kept for a while to maintain backwards compatibility, but the idea is to
// remove the wrapper-based one eventually.
//
// @tparam Func     Function template instantiated with exec::VectorExec.
// @tparam TReturn  Return type of the function signature.
// @tparam TArgs    Argument types of the function signature.
// @param aliases     Names to register the function under.
// @param returnType  Optional explicit return type; ownership is handed on to
//                    exec::registerSimpleFunction.
template <template <class> typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = Func<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: an unqualified `move` only resolves through ADL and
  // could silently bind to an unrelated overload.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}
VELOX_REGISTER_VECTOR_FUNCTION

velox/velox/expression/VectorFunction.h

// Registers a vectorized UDF associated with a given tag.
// This should be used in the same namespace the declare macro is used in.
//
// The expansion is a braced block that forward-declares the registration
// function named by _VELOX_REGISTER_FUNC_NAME(tag) (taking a
// const std::string&) and immediately calls it with `name`. Because it
// expands to a compound statement, it can only appear where a statement is
// allowed, i.e. inside a function body.
#define VELOX_REGISTER_VECTOR_FUNCTION(tag, name)                   \
  {                                                                 \
    extern void _VELOX_REGISTER_FUNC_NAME(tag)(const std::string&); \
    _VELOX_REGISTER_FUNC_NAME(tag)(name);                           \
  }
Register

./velox/functions/Registerer.h

// Registers a simple function whose implementation is the nested `udf`
// template of the wrapper struct `Func`.
//
// @tparam Func     Wrapper struct exposing `template <...> struct udf`.
// @tparam TReturn  Return type of the function signature.
// @tparam TArgs    Argument types of the function signature.
// @param aliases     Names to register the function under.
// @param returnType  Optional explicit return type; ownership is handed on to
//                    exec::registerSimpleFunction.
template <typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = typename Func::template udf<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: an unqualified `move` only resolves through ADL and
  // could silently bind to an unrelated overload.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}

// New registration function; mostly a copy of the wrapper-based overload, but
// taking the inner "udf" struct template directly instead of the wrapper. Both
// are kept for a while to maintain backwards compatibility, but the idea is to
// remove the wrapper-based one eventually.
//
// @tparam Func     Function template instantiated with exec::VectorExec.
// @tparam TReturn  Return type of the function signature.
// @tparam TArgs    Argument types of the function signature.
// @param aliases     Names to register the function under.
// @param returnType  Optional explicit return type; ownership is handed on to
//                    exec::registerSimpleFunction.
template <template <class> typename Func, typename TReturn, typename... TArgs>
void registerFunction(
    const std::vector<std::string>& aliases = {},
    std::shared_ptr<const Type> returnType = nullptr) {
  using funcClass = Func<exec::VectorExec>;
  using holderClass =
      core::UDFHolder<funcClass, exec::VectorExec, TReturn, TArgs...>;
  // Spell out std::move: an unqualified `move` only resolves through ADL and
  // could silently bind to an unrelated overload.
  exec::registerSimpleFunction<holderClass>(aliases, std::move(returnType));
}

./velox/expression/SimpleFunctionRegistry.h

// This function should be called once and alone.
//
// Registers the simple function described by `UDFHolder` in the
// SimpleFunctions() registry, wrapping it in a
// SimpleFunctionAdapterFactoryImpl.
//
// @param names       Names to register the function under.
// @param returnType  Optional explicit return type, forwarded to the registry.
template <typename UDFHolder>
void registerSimpleFunction(
    const std::vector<std::string>& names,
    std::shared_ptr<const Type> returnType) {
  // `returnType` is a by-value sink parameter that is not used afterwards, so
  // move it on rather than paying for another shared_ptr copy.
  SimpleFunctions()
      .registerFunction<SimpleFunctionAdapterFactoryImpl<UDFHolder>>(
          names, std::move(returnType));
}

./velox/expression/FunctionRegistry.h

public:
  template <typename UDF>
  void registerFunction(
      const std::vector<std::string>& aliases = {},
      std::shared_ptr<const Type> returnType = nullptr) {
    auto metadata =
        GetSingletonUdfMetadata<typename UDF::Metadata>(std::move(returnType));
    auto&& names = aliases.empty()
        ? std::vector<std::string>{metadata->getName()}
        : aliases;

    registerFunctionInternal(names, metadata, [metadata]() {
      return CreateUdf<UDF>(metadata->returnType());
    });
  }



....


// Stores one FunctionEntry (metadata + factory) per name in `names`, keyed by
// the sanitized function name and then by the metadata's signature. An
// existing entry with the same name and signature is replaced.
//
// @param names     Names (aliases) to register the function under.
// @param metadata  Metadata shared by all entries created here.
// @param factory   Callable that creates the function; copied into each entry.
void registerFunctionInternal(
      const std::vector<std::string>& names,
      const std::shared_ptr<const Metadata>& metadata,
      typename FunctionEntry<Function, Metadata>::FunctionFactory factory) {
    for (const auto& name : names) {
      // operator[] default-constructs an empty SignatureMap the first time a
      // name is seen, so no separate find-then-insert step is needed.
      auto& signatureMap = registeredFunctions_[sanitizeFunctionName(name)];

      // Pass `factory` as an lvalue: it is reused on every iteration, and
      // moving it in the first iteration could leave later aliases with a
      // moved-from factory.
      signatureMap[*metadata->signature()] =
          std::make_unique<const FunctionEntry<Function, Metadata>>(
              metadata, factory);
    }
  }




Get

./velox/functions/FunctionRegistry.cpp

// Builds a fresh FunctionSignatureMap by letting the simple-function
// populator fill it first and the vector-function populator second.
FunctionSignatureMap getFunctionSignatures() {
  FunctionSignatureMap allSignatures;
  populateSimpleFunctionSignatures(allSignatures);
  populateVectorFunctionSignatures(allSignatures);
  return allSignatures;
}

...


// Adds an entry to `map` for every function name known to
// exec::SimpleFunctions(), mapping the name to that function's signatures.
// An existing entry with the same name is overwritten.
void populateSimpleFunctionSignatures(FunctionSignatureMap& map) {
  auto& simpleFunctions = exec::SimpleFunctions();
  for (const auto& functionName : simpleFunctions.getFunctionNames()) {
    // Assign the returned value directly so it is moved into the map instead
    // of being copied from a named local.
    map[functionName] = simpleFunctions.getFunctionSignatures(functionName);
  }
}

./velox/expression/FunctionRegistry.h

std::vector<const FunctionSignature*> getFunctionSignatures(
      const std::string& name) {
    std::vector<const FunctionSignature*> signatures;
    if (auto signatureMap = getSignatureMap(name)) {
      signatures.reserve(signatureMap->size());
      for (const auto& pair : *signatureMap) {
        signatures.emplace_back(&pair.first);
      }
    }

    return signatures;
  }


// Looks up the SignatureMap stored under the sanitized form of `name`.
// Returns a pointer into the registry, or nullptr when the name is unknown.
SignatureMap* getSignatureMap(const std::string& name) {
    auto entry = registeredFunctions_.find(sanitizeFunctionName(name));
    return entry == registeredFunctions_.end() ? nullptr : &entry->second;
  }
Velox evaluateOnce trace

FunctionBaseTest.h:
evaluateOnce

template <typename ReturnType, typename... Args>
  std::optional<ReturnType> evaluateOnce(
      const std::string& expr,
      const RowVectorPtr rowVectorPtr) {
    auto result =
        evaluate<SimpleVector<EvalType<ReturnType>>>(expr, rowVectorPtr);
    return result->isNullAt(0) ? std::optional<ReturnType>{}
                               : ReturnType(result->valueAt(0));
  }

evaluate

// Evaluates `expression` over `data` and casts the resulting vector to
// std::shared_ptr<T> via castEvaluateResult.
template <typename T>
  std::shared_ptr<T> evaluate(
      const std::string& expression,
      const RowVectorPtr& data) {
    auto uncast = evaluate(expression, data);
    return castEvaluateResult<T>(uncast, expression);
  }

 // Evaluates `expression` over `data` and returns the result vector as is.
 // Use this directly if you don't want it to cast the returned vector.
  VectorPtr evaluate(const std::string& expression, const RowVectorPtr& data) {
    // The input's type is viewed as a RowType so the expression can be typed
    // against it.
    auto inputRowType =
        std::dynamic_pointer_cast<const RowType>(data->type());
    auto typed = makeTypedExpr(expression, inputRowType);

    return evaluate(typed, data);
  }
// Parses `text` with the fixture's parse options, then infers the types of
// the parsed expression against `rowType`, using the execution context's
// memory pool.
core::TypedExprPtr makeTypedExpr(
      const std::string& text,
      const RowTypePtr& rowType) {
    auto parsed = parse::parseExpr(text, options_);
    return core::Expressions::inferTypes(parsed, rowType, execCtx_.pool());
  }

Expressions.cpp

// static
// Convenience overload: forwards to the five-argument inferTypes overload,
// passing an empty `{}` for its third parameter and all other arguments
// through unchanged.
TypedExprPtr Expressions::inferTypes(
    const std::shared_ptr<const core::IExpr>& expr,
    const TypePtr& inputRow,
    memory::MemoryPool* pool,
    const VectorPtr& complexConstants) {
  return inferTypes(expr, inputRow, {}, pool, complexConstants);
}
// Builds the typed call expression for `expr` applied to `inputs`.
//
// First asks adjustLastNArguments to adjust all of the inputs; if it returns
// a non-null expression, that expression is the result. Otherwise the return
// type is resolved from the inputs as given (resolveTypeImpl is called with
// nullOnFailure = false) and a CallTypedExpr is created for the function name.
TypedExprPtr createWithImplicitCast(
    const std::shared_ptr<const core::CallExpr>& expr,
    const std::vector<TypedExprPtr>& inputs) {
  if (auto adjusted = adjustLastNArguments(inputs, expr, inputs.size())) {
    return adjusted;
  }
  auto type = resolveTypeImpl(inputs, expr, false /*nullOnFailure*/);
  // `inputs` is a const reference, so it can only be copied. The previous
  // std::move here was a no-op that merely suggested a move.
  return std::make_shared<CallTypedExpr>(
      type, inputs, std::string{expr->getFunctionName()});
}
// Determine output type based on input types.
//
// Delegates to the hook returned by Expressions::getResolverHook(), after
// checking that the hook is not null. `inputs`, `expr` and `nullOnFailure`
// are passed to the hook unchanged.
TypePtr resolveTypeImpl(
    std::vector<TypedExprPtr> inputs,
    const std::shared_ptr<const CallExpr>& expr,
    bool nullOnFailure) {
  VELOX_CHECK_NOT_NULL(Expressions::getResolverHook());
  return Expressions::getResolverHook()(inputs, expr, nullOnFailure);
}

/usr/include/c++/9/bits/std_function.h:688

  // Call operator of std::function (libstdc++ excerpt). Throws via
  // __throw_bad_function_call() when the wrapper holds no target; otherwise
  // forwards the arguments to the stored invoker together with the stored
  // functor.
  template<typename _Res, typename... _ArgTypes>
    _Res
    function<_Res(_ArgTypes...)>::
    operator()(_ArgTypes... __args) const
    {
      if (_M_empty())
        __throw_bad_function_call();
      return _M_invoker(_M_functor, std::forward<_ArgTypes>(__args)...);
    }

ArrayVector, MapVector and RowVector

Flat vectors of complex types ARRAY, MAP and ROW / STRUCT are represented using ArrayVector, MapVector and RowVector.

RowVector

Finally, RowVector stores values of type ROW (e.g. structs). In addition to the nulls buffer, it contains a list of child vectors.

// The list of child vectors held by a RowVector.
std::vector<VectorPtr> children_;

Here is an example.
(原文此处为一张示例图片,抓取时未能保留。)
RowVector is used to represent a single column of type struct as well as a collection of columns that are being passed from one operator to the next during query execution.

// Let's create two vectors of 64-bit integers and one vector of strings.
  auto a = makeFlatVector<int64_t>({0, 1, 2, 3, 4, 5, 6});
  auto b = makeFlatVector<int64_t>({0, 5, 10, 15, 20, 25, 30});
  auto dow = makeFlatVector<std::string>(
      {"monday",
       "tuesday",
       "wednesday",
       "thursday",
       "friday",
       "saturday",
       "sunday"});

  // Combine the three vectors into one RowVector whose children are named
  // "a", "b" and "dow".
  auto data = makeRowVector({"a", "b", "dow"}, {a, b, dow});

  // Print the vector's own description first, then the text for the range
  // [0, data->size()) -- presumably one line per row; confirm against
  // toString's documentation.
  std::cout << std::endl
            << "> vectors a, b, dow: " << data->toString() << std::endl;
  std::cout << data->toString(0, data->size()) << std::endl;
文件读写

./velox/common/file/File.h

// A read-only file. All methods in this object should be thread safe.
//
// Abstract interface: implementations must provide the buffer-based pread,
// shouldCoalesce, size, memoryUsage, getName and getNaturalReadSize. The
// remaining virtual methods are either declared here and defined elsewhere
// or have inline defaults.
class ReadFile {
 public:
  virtual ~ReadFile() = default;

  // Reads the data at [offset, offset + length) into the provided pre-allocated
  // buffer 'buf'. The bytes are returned as a string_view pointing to 'buf'.
  //
  // This method should be thread safe.
  virtual std::string_view
  pread(uint64_t offset, uint64_t length, void* FOLLY_NONNULL buf) const = 0;

  // Same as above, but returns owned data directly.
  //
  // This method should be thread safe.
  virtual std::string pread(uint64_t offset, uint64_t length) const;

  // Reads starting at 'offset' into the memory referenced by the
  // Ranges in 'buffers'. The buffers are filled left to right. A
  // buffer with nullptr data will cause its size worth of bytes to be skipped.
  //
  // This method should be thread safe.
  virtual uint64_t preadv(
      uint64_t /*offset*/,
      const std::vector<folly::Range<char*>>& /*buffers*/) const;

  // Vectorized read API. Implementations can coalesce and parallelize.
  // The offsets don't need to be sorted.
  // `iobufs` is a range of IOBufs to store the read data. They
  // will be stored in the same order as the input `regions` vector. So the
  // array must be pre-allocated by the caller, with the same size as `regions`,
  // but doesn't need to be initialized, since each iobuf will be
  // copy-constructed by the preadv.
  //
  // This method should be thread safe.
  virtual void preadv(
      folly::Range<const common::Region*> regions,
      folly::Range<folly::IOBuf*> iobufs) const;

  // Like preadv but may execute asynchronously and returns the read
  // size or exception via SemiFuture. Use hasPreadvAsync() to check
  // if the implementation is in fact asynchronous.
  //
  // The default implementation below calls preadv synchronously and wraps
  // either its result or the caught std::exception in the returned future.
  // NOTE(review): the exception is caught and forwarded as
  // `const std::exception&`; confirm that folly::makeSemiFuture preserves the
  // dynamic exception type rather than slicing it.
  //
  // This method should be thread safe.
  virtual folly::SemiFuture<uint64_t> preadvAsync(
      uint64_t offset,
      const std::vector<folly::Range<char*>>& buffers) const {
    try {
      return folly::SemiFuture<uint64_t>(preadv(offset, buffers));
    } catch (const std::exception& e) {
      return folly::makeSemiFuture<uint64_t>(e);
    }
  }

  // Returns true if preadvAsync has a native implementation that is
  // asynchronous. The default implementation is synchronous.
  virtual bool hasPreadvAsync() const {
    return false;
  }

  // Whether preads should be coalesced where possible. E.g. remote disk would
  // set to true, in-memory to false.
  virtual bool shouldCoalesce() const = 0;

  // Number of bytes in the file.
  virtual uint64_t size() const = 0;

  // An estimate for the total amount of memory *this uses.
  virtual uint64_t memoryUsage() const = 0;

  // The total number of bytes *this has been used to read since creation or
  // the last resetBytesRead. We sum all the |length| variables passed to
  // preads, not the actual amount of bytes read (which might be less).
  virtual uint64_t bytesRead() const {
    return bytesRead_;
  }

  // Resets the counter reported by bytesRead() to zero.
  virtual void resetBytesRead() {
    bytesRead_ = 0;
  }

  // Returns the name of this file.
  virtual std::string getName() const = 0;

  //
  // Get the natural size for reads.
  // @return the number of bytes that should be read at once
  //
  virtual uint64_t getNaturalReadSize() const = 0;

 protected:
  // Backing counter for bytesRead(). Declared mutable and atomic so that the
  // const read methods can update it.
  mutable std::atomic<uint64_t> bytesRead_ = 0;
};

./velox/common/file/FileSystems.h

/// An abstract FileSystem. Every operation except construction and
/// destruction is pure virtual and must be supplied by the implementation.
class FileSystem {
 public:
  /// Stores the shared configuration for use by derived classes.
  FileSystem(std::shared_ptr<const Config> config)
      : config_(std::move(config)) {}
  virtual ~FileSystem() = default;

  /// Returns the name of the File System
  virtual std::string name() const = 0;

  /// Returns a ReadFile handle for a given file path
  virtual std::unique_ptr<ReadFile> openFileForRead(
      std::string_view path,
      const FileOptions& options = {}) = 0;

  /// Returns a WriteFile handle for a given file path
  virtual std::unique_ptr<WriteFile> openFileForWrite(
      std::string_view path,
      const FileOptions& options = {}) = 0;

  /// Deletes the file at 'path'. Throws on error.
  virtual void remove(std::string_view path) = 0;

  /// Renames the file at 'oldPath' to 'newPath'. If 'overwrite' is true, the
  /// rename overwrites a file that already exists at 'newPath'.
  /// Throws a velox user exception on error.
  virtual void rename(
      std::string_view oldPath,
      std::string_view newPath,
      bool overwrite = false) = 0;

  /// Returns true if the file exists.
  virtual bool exists(std::string_view path) = 0;

  /// Returns the list of files or folders in a path. Currently, this method
  /// will be used for testing, but we will need to change this to an iterator
  /// output method to avoid potential heavy output if there are many entries in
  /// the folder.
  virtual std::vector<std::string> list(std::string_view path) = 0;

  /// Create a directory (recursively). Throws velox exception on failure.
  virtual void mkdir(std::string_view path) = 0;

  /// Remove a directory (all the files and sub-directories underneath
  /// recursively). Throws velox exception on failure.
  virtual void rmdir(std::string_view path) = 0;

 protected:
  /// Configuration supplied at construction time.
  std::shared_ptr<const Config> config_;
};
S3ReadFile
// TODO: Implement retry on failure.
class S3ReadFile final : public ReadFile {
 public:
  S3ReadFile(const std::string& path, Aws::S3::S3Client* client)
      : client_(client) {
    getBucketAndKeyFromS3Path(path, bucket_, key_);
  }

  // Gets the length of the file.
  // Checks if there are any issues reading the file.
  void initialize() {
    // Make it a no-op if invoked twice.
    if (length_ != -1) {
      return;
    }

    Aws::S3::Model::HeadObjectRequest request;
    request.SetBucket(awsString(bucket_));
    request.SetKey(awsString(key_));

    auto outcome = client_->HeadObject(request);
    VELOX_CHECK_AWS_OUTCOME(
        outcome, "Failed to get metadata for S3 object", bucket_, key_);
    length_ = outcome.GetResult().GetContentLength();
    VELOX_CHECK_GE(length_, 0);
  }

  std::string_view pread(uint64_t offset, uint64_t length, void* buffer)
      const override {
    preadInternal(offset, length, static_cast<char*>(buffer));
    return {static_cast<char*>(buffer), length};
  }

  std::string pread(uint64_t offset, uint64_t length) const override {
    std::string result(length, 0);
    char* position = result.data();
    preadInternal(offset, length, position);
    return result;
  }

  uint64_t preadv(
      uint64_t offset,
      const std::vector<folly::Range<char*>>& buffers) const override {
    // 'buffers' contains Ranges(data, size)  with some gaps (data = nullptr) in
    // between. This call must populate the ranges (except gap ranges)
    // sequentially starting from 'offset'. AWS S3 GetObject does not support
    // multi-range. AWS S3 also charges by number of read requests and not size.
    // The idea here is to use a single read spanning all the ranges and then
    // populate individual ranges. We pre-allocate a buffer to support this.
    size_t length = 0;
    for (const auto range : buffers) {
      length += range.size();
    }
    // TODO: allocate from a memory pool
    std::string result(length, 0);
    preadInternal(offset, length, static_cast<char*>(result.data()));
    size_t resultOffset = 0;
    for (auto range : buffers) {
      if (range.data()) {
        memcpy(range.data(), &(result.data()[resultOffset]), range.size());
      }
      resultOffset += range.size();
    }
    return length;
  }

  uint64_t size() const override {
    return length_;
  }

  uint64_t memoryUsage() const override {
    // TODO: Check if any buffers are being used by the S3 library
    return sizeof(Aws::S3::S3Client) + kS3MaxKeySize + 2 * sizeof(std::string) +
        sizeof(int64_t);
  }

  bool shouldCoalesce() const final {
    return false;
  }

  std::string getName() const final {
    return fmt::format("s3://{}/{}", bucket_, key_);
  }

  uint64_t getNaturalReadSize() const final {
    return 72 << 20;
  }

 private:
  // The assumption here is that "position" has space for at least "length"
  // bytes.
  void preadInternal(uint64_t offset, uint64_t length, char* position) const {
    // Read the desired range of bytes.
    Aws::S3::Model::GetObjectRequest request;
    Aws::S3::Model::GetObjectResult result;

    request.SetBucket(awsString(bucket_));
    request.SetKey(awsString(key_));
    std::stringstream ss;
    ss << "bytes=" << offset << "-" << offset + length - 1;
    request.SetRange(awsString(ss.str()));
    request.SetResponseStreamFactory(
        AwsWriteableStreamFactory(position, length));
    auto outcome = client_->GetObject(request);
    VELOX_CHECK_AWS_OUTCOME(outcome, "Failed to get S3 object", bucket_, key_);
  }

  Aws::S3::S3Client* client_;
  std::string bucket_;
  std::string key_;
  int64_t length_ = -1;
};
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值