1. Install conda and switch its channels to a domestic (China) mirror
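One way to do the mirror switch is the Tsinghua TUNA mirror; a minimal sketch (any domestic conda mirror works, these URLs are just one common choice):
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
conda config --set show_channel_urls yes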
2. Download Intel Arrow:
git clone https://github.com/Intel-bigdata/arrow.git
cd arrow && git checkout native-sql-engine-clean
vim ci/conda_env_gandiva.yml  # pin the clang/llvm toolchain by setting these two entries:
clangdev=7
llvmdev=7
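An equivalent non-interactive edit, if you would rather script it (a sketch assuming the file keeps one package spec per line, as conda --file lists do):
sed -i 's/^clangdev.*/clangdev=7/' ci/conda_env_gandiva.yml
sed -i 's/^llvmdev.*/llvmdev=7/' ci/conda_env_gandiva.yml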
conda create -y -n pyarrow-dev -c conda-forge \
--file ci/conda_env_unix.yml \
--file ci/conda_env_cpp.yml \
--file ci/conda_env_python.yml \
--file ci/conda_env_gandiva.yml \
compilers \
python=3.7 \
pandas
conda activate pyarrow-dev
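Before building, a quick sanity check that the activated env actually provides the pinned toolchain (the versions are what I would expect, not guaranteed):
which cmake
clang --version    # expect 7.x, matching the pin above
python --version   # expect 3.7.x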
3. Install the test components:
yum install gtest-devel
yum install gmock
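If yum cannot find these packages, they usually come from the EPEL repository on CentOS/RHEL (an assumption about your distro; package names can differ elsewhere):
yum install -y epel-release
yum install -y gtest-devel gmock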
4. Build Arrow:
cd arrow && git checkout native-sql-engine-clean
git submodule update --init --recursive
mkdir -p cpp/release-build
cd cpp/release-build
cmake -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON \
      -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON \
      -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON \
      -DARROW_JSON=ON -DARROW_DATASET=ON ..
make -j
make install
# build java
cd ../../java
# change the 'arrow.cpp.build.dir' property in gandiva/pom.xml to the relative path of the cpp build dir
mvn clean install -P arrow-jni -am -Darrow.cpp.build.dir=../cpp/release-build/release/ -DskipTests
# if you are behind a proxy, also pass the socks proxy settings to maven
mvn clean install -P arrow-jni -am -Darrow.cpp.build.dir=../cpp/release-build/release/ -DskipTests -DsocksProxyHost=${proxyHost} -DsocksProxyPort=1080
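Before moving on, it is worth confirming the build actually produced the Gandiva artifacts; a hedged check, run from the java/ directory the mvn step left you in:
ls ../cpp/release-build/release/ | grep -i gandiva   # native libs such as libgandiva_jni.so
find ~/.m2/repository -name "*gandiva*.jar"          # jars installed by the mvn step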
5. Build OAP:
git clone https://github.com/Intel-bigdata/OAP.git
cd OAP && git checkout branch-nativesql-spark-3.0.0
cd oap-native-sql
cd cpp/
mkdir build/
cd build/
cmake .. -DTESTS=ON
make -j
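With -DTESTS=ON the unit tests are built as well; assuming they are registered with CTest, as is conventional for cmake projects, they can be run from the build dir:
ctest -V    # or: make test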
6. Spark configuration:
spark-defaults.conf:
spark.sql.sources.useV1SourceList avro
spark.sql.join.preferSortMergeJoin false
spark.sql.extensions com.intel.oap.ColumnarPlugin
spark.shuffle.manager org.apache.spark.shuffle.sort.ColumnarShuffleManager
spark.shuffle.compress true
spark.io.compression.codec lz4
spark.executorEnv.LD_LIBRARY_PATH ${ld_library_path}/libs
spark.executorEnv.ARROW_LIBHDFS3_DIR ${ld_library_path}/libs
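After restarting Spark with this configuration, a minimal smoke test (a sketch; spark-shell reads scala from stdin, and a trivial query may not exercise every columnar operator) is to check the physical plan for Columnar* operators, which indicates the plugin was picked up:
echo 'spark.sql("SELECT 1 AS a").queryExecution.executedPlan' | spark-shell
# expect Columnar* operators contributed by com.intel.oap.ColumnarPlugin in the printed plan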