Arrow 编译和测试

zhixingheyi_tian

已于 2023-01-12 15:50:25 修改

阅读量922

点赞数

分类专栏：云计算　文章标签：大数据

于 2022-07-14 14:15:10 首次发布

本文链接：https://blog.csdn.net/zhixingheyi_tian/article/details/125782791

版权

云计算专栏收录该内容

92 篇文章 0 订阅

订阅专栏

Arrow build

https://github.com/apache/arrow/blob/apache-arrow-4.0.0/docs/source/developers/cpp/building.rst

Minimal debug build with unit tests:

git clone https://github.com/apache/arrow.git
cd arrow/cpp
mkdir debug
cd debug
cmake -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_TESTS=ON ..
make -j"$(nproc)"   # a bare "make -j" places no limit on parallel jobs and can exhaust memory

# find ./ -name "*-test"
./debug/arrow-public-api-test
./debug/arrow-arrow-jniutil-test
./debug/arrow-io-compressed-test
./debug/arrow-extension-type-test
./debug/arrow-io-buffered-test
./debug/arrow-misc-test
./debug/arrow-stl-test
./debug/arrow-io-file-test
...

Build Test with ORC

cmake -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_TESTS=ON -DARROW_ORC=ON -DARROW_COMPUTE=ON -DARROW_DATASET=ON ..

Build Test with Parquet

cmake -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_TESTS=ON -DARROW_PARQUET=ON -DARROW_COMPUTE=ON -DARROW_DATASET=ON ..

# Register a unit test that belongs to the "dataset" portion of the test suite.
#
# Usage:
#   add_arrow_dataset_test(<rel_test_name> [PREFIX <prefix>] [LABELS <label>...] [...])
#
#   REL_TEST_NAME  forwarded as the first argument of add_arrow_test
#   PREFIX         forwarded as PREFIX; "arrow-dataset" is used when not given
#   LABELS         forwarded as LABELS; "arrow_dataset" is used when not given
#
# ${ARROW_DATASET_TEST_LINK_LIBS} is always passed as EXTRA_LINK_LIBS, and every
# argument not recognized here is handed to add_arrow_test unchanged.
function(add_arrow_dataset_test REL_TEST_NAME)
  set(options)
  set(one_value_args PREFIX)
  set(multi_value_args LABELS)
  cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN})

  # Begin with the dataset default and replace it only if the caller gave a value.
  set(PREFIX "arrow-dataset")
  if(ARG_PREFIX)
    set(PREFIX ${ARG_PREFIX})
  endif()

  set(LABELS "arrow_dataset")
  if(ARG_LABELS)
    set(LABELS ${ARG_LABELS})
  endif()

  # Keyword order is kept as-is: the unparsed arguments must come last so they
  # follow the LABELS values exactly as before.
  add_arrow_test(${REL_TEST_NAME}
                 EXTRA_LINK_LIBS ${ARROW_DATASET_TEST_LINK_LIBS}
                 PREFIX ${PREFIX}
                 LABELS ${LABELS}
                 ${ARG_UNPARSED_ARGUMENTS})
endfunction()

# Register file_parquet_test through add_arrow_dataset_test, but only when
# ARROW_PARQUET evaluates to true.
if(ARROW_PARQUET)
  add_arrow_dataset_test(file_parquet_test)
endif()

git submodule update --init
# ${ARROW} must be set to the root directory of the arrow checkout
export PARQUET_TEST_DATA=${ARROW}/cpp/submodules/parquet-testing/data

# ./debug/arrow-dataset-file-parquet-test
Running main() from /home/shen/software/googletest/googletest/src/gtest_main.cc
[==========] Running 19 tests from 2 test suites.
[----------] Global test environment set-up.
[----------] 15 tests from TestParquetFileFormat
[ RUN      ] TestParquetFileFormat.ScanRecordBatchReader
[       OK ] TestParquetFileFormat.ScanRecordBatchReader (99 ms)
[ RUN      ] TestParquetFileFormat.ScanRecordBatchReaderDictEncoded
[       OK ] TestParquetFileFormat.ScanRecordBatchReaderDictEncoded (611 ms)
[ RUN      ] TestParquetFileFormat.ScanRecordBatchReaderPreBuffer
[       OK ] TestParquetFileFormat.ScanRecordBatchReaderPreBuffer (78 ms)

Build Benchmark

cmake -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_TESTS=ON -DARROW_BUILD_BENCHMARKS=ON -DARROW_JEMALLOC=OFF -DARROW_PARQUET=ON -DARROW_COMPUTE=ON -DARROW_DATASET=ON ..

Build Type

# compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE=<type> .')
# For all builds:
# For CMAKE_BUILD_TYPE=Debug
#   -ggdb: Enable gdb debugging
# For CMAKE_BUILD_TYPE=FastDebug
#   Same as DEBUG, except with some optimizations on.
# For CMAKE_BUILD_TYPE=Release
#   -O3: Enable all compiler optimizations
#   Debug symbols are stripped for reduced binary size. Add
#   -DARROW_CXXFLAGS="-g" to add them

./parquet-arrow-reader-writer-benchmark

BM_ReadColumn<true,BooleanType>/5/10                       387778965 ns    386866607 ns            2 bytes_per_second=25.8487M/s items_per_second=27.1043M/s
BM_ReadBinaryColumn/null_probability:0/unique_values:32   3417224521 ns   3409209551 ns            1 bytes_per_second=53.5429M/s items_per_second=3.07572M/s
BM_ReadBinaryColumn/null_probability:0/unique_values:-1   1423706251 ns   1420369542 ns            1 bytes_per_second=151.331M/s items_per_second=7.38242M/s
BM_ReadBinaryColumn/null_probability:1/unique_values:32   3796615016 ns   3787710745 ns            1 bytes_per_second=47.8159M/s items_per_second=2.76836M/s
BM_ReadBinaryColumn/null_probability:50/unique_values:32  3129534432 ns   3122194897 ns            1 bytes_per_second=35.6467M/s items_per_second=3.35846M/s
BM_ReadBinaryColumn/null_probability:99/unique_values:32  1907305272 ns   1902829090 ns            1 bytes_per_second=21.7693M/s items_per_second=5.51062M/s
BM_ReadBinaryColumn/null_probability:1/unique_values:-1   1569981873 ns   1566302194 ns            1 bytes_per_second=136.12M/s items_per_second=6.6946M/s
BM_ReadBinaryColumn/null_probability:50/unique_values:-1  1576715826 ns   1573015486 ns            1 bytes_per_second=81.0747M/s items_per_second=6.66602M/s
BM_ReadBinaryColumn/null_probability:99/unique_values:-1  1383394533 ns   1380152318 ns            1 bytes_per_second=30.2516M/s items_per_second=7.59754M/s
BM_ReadStructColumn/0                                       38597186 ns     38506719 ns           18 bytes_per_second=311.634M/s items_per_second=27.231M/s