编译spark 2.1.0源码
- 准备环境:
- 准备spark源码包:https://archive.apache.org/dist/spark/spark-2.1.0/spark-2.1.0.tgz
- 准备maven-3.3.9以上
- scala-2.11.8:https://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.tgz
- 设置maven内存参数(堆内存与JIT代码缓存大小):
# Give Maven a larger heap and JIT code cache; the Spark build fails with
# OutOfMemoryError / "CodeCache is full" under the JVM defaults.
export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
- 编译CDH版需在pom.xml的&lt;repositories&gt;中加入cloudera仓库配置:
# 在解压后的spark-2.1.0目录下,有个pom.xml文件
<!-- Place this inside the existing <repositories> element of pom.xml so
     Maven can resolve the CDH-flavoured Hadoop/Hive artifacts
     (e.g. hadoop 2.6.0-cdh5.16.1), which are not on Maven Central. -->
<repository>
<id>cloudera</id>
<name>cloudera Repository</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
# 修改文件./dev/make-distribution.sh
# 下面这些都注释掉:
###############################################
#VERSION=$("$MVN" help:evaluate -Dexpression=project.version $@ 2>/dev/null | grep -v "INFO" | tail -n 1)
#SCALA_VERSION=$("$MVN" help:evaluate -Dexpression=scala.binary.version $@ 2>/dev/null\
# | grep -v "INFO"\
# | tail -n 1)
#SPARK_HADOOP_VERSION=$("$MVN" help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\
# | grep -v "INFO"\
# | tail -n 1)
#SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\
# | grep -v "INFO"\
# | fgrep --count "<id>hive</id>";\
# Reset exit status to 0, otherwise the script stops here if the last grep finds nothing\
# because we use "set -o pipefail"
# echo -n)
###############################################
# 手动配置好参数:
# Hard-coded replacements for the slow `mvn help:evaluate` lookups that were
# commented out in make-distribution.sh. Each value mirrors what the
# corresponding -Dexpression would have returned.
VERSION=2.1.0                        # project.version — the Spark version
SCALA_VERSION=2.11                   # scala.binary.version — binary version only, NOT the full 2.11.8
SPARK_HADOOP_VERSION=2.6.0-cdh5.16.1 # hadoop.version — the CDH Hadoop artifact version
SPARK_HIVE=1                         # flag: 1 = Hive profile active (this is NOT a version number)
# Build a distributable Spark tarball against CDH Hadoop/Hive.
#   --name                      suffix for the generated tarball name
#   --tgz                       produce a .tgz archive
#   -Pyarn -Phadoop-2.6         enable YARN support and the Hadoop 2.6 build profile
#   -Dhadoop.version            exact CDH Hadoop artifact version to link against
#   -Phive -Phive-thriftserver  include Hive support and the JDBC/ODBC thrift server
#   -Dhive.version              exact CDH Hive artifact version
#   -DskipTests                 skip unit tests to speed up the build
./dev/make-distribution.sh \
--name 2.6.0-cdh5.16.1 \
--tgz \
-Pyarn -Phadoop-2.6 \
-Dhadoop.version=2.6.0-cdh5.16.1 \
-Phive -Phive-thriftserver -Dhive.version=1.1.0-cdh5.16.1 \
-DskipTests clean package