1) Build
## download the source tarball spark-3.2.1.tgz from http://archive.apache.org/dist/spark/spark-3.2.1/
## unpack to ~/work/spark-3.2.1-src
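## for example, a minimal sketch (exact archive path and unpacked dir name assumed):
$ wget http://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1.tgz
$ mkdir -p ~/work
$ tar xzf spark-3.2.1.tgz -C ~/work
$ mv ~/work/spark-3.2.1 ~/work/spark-3.2.1-src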
$ cd ~/work/spark-3.2.1-src
$ export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
$ dev/make-distribution.sh --name without-hadoop \
--pip --tgz -Phive -Phive-thriftserver -Phadoop-provided -Pyarn
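## -Phadoop-provided keeps the Hadoop jars out of the distribution; they are supplied at runtime via SPARK_DIST_CLASSPATH (configured below)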
$ tar xvf spark-3.2.1-bin-without-hadoop.tgz -C ..
$ cd ..
$ mv spark-3.2.1-bin-without-hadoop spark-3.2.1
## configure
$ cd spark-3.2.1
$ diff -u conf/spark-env.sh.template conf/spark-env.sh
--- conf/spark-env.sh.template 2022-06-24 09:16:18.000000000 +0800
+++ conf/spark-env.sh 2022-06-24 17:52:47.000000000 +0800
@@ -71,3 +71,7 @@
# You might get better performance to enable these options if using native BLAS (see SPARK-21305).
# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
+
+JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
+SPARK_LOCAL_IP=localhost
+SPARK_DIST_CLASSPATH=`hadoop classpath`
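## the backticks expand `hadoop classpath` when spark-env.sh is sourced, so hadoop must be on PATH; a quick sanity check:
$ hadoop classpath | tr ':' '\n' | head -3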
$ diff -u conf/log4j.properties.template conf/log4j.properties
--- conf/log4j.properties.template 2022-06-24 09:16:18.000000000 +0800
+++ conf/log4j.properties 2022-06-24 16:28:28.000000000 +0800
@@ -16,7 +16,7 @@
#
# Set everything to be logged to the console
-log4j.rootCategory=INFO, console
+log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
## test
## SPARK_DIST_CLASSPATH is already picked up from conf/spark-env.sh
$ bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
examples/jars/spark-examples_2.12-3.2.1.jar 10
22/06/24 17:53:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Pi is roughly 3.1387311387311385
2) Spark on YARN
## configure yarn
$ cd ~/work/hadoop
$ diff -u etc/hadoop/yarn-site.xml.orig etc/hadoop/yarn-site.xml
--- etc/hadoop/yarn-site.xml.orig 2022-05-17 09:20:54.000000000 +0800
+++ /Users/sun_xo/work/hadoop/etc/hadoop/yarn-site.xml 2022-06-23 10:13:52.000000000 +0800
@@ -29,4 +29,17 @@
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
+ <property>
+ <name>yarn.log.server.url</name>
+ <value>http://localhost:19888/jobhistory/logs</value>
+ </property>
+  <!-- disable YARN memory checks so small test containers are not killed -->
+ <property>
+ <name>yarn.nodemanager.pmem-check-enabled</name>
+ <value>false</value>
+ </property>
+ <property>
+ <name>yarn.nodemanager.vmem-check-enabled</name>
+ <value>false</value>
+ </property>
</configuration>
## configure spark
$ diff -u spark-env.sh.template spark-env.sh
--- spark-env.sh.template 2022-06-24 09:16:18.000000000 +0800
+++ spark-env.sh 2022-06-24 18:49:42.000000000 +0800
@@ -71,3 +71,10 @@
# You might get better performance to enable these options if using native BLAS (see SPARK-21305).
# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
+
+JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
+SPARK_LOCAL_IP=localhost
+SPARK_DIST_CLASSPATH=`hadoop classpath`
+HADOOP_CONF_DIR=~/work/hadoop/etc/hadoop
+YARN_CONF_DIR=$HADOOP_CONF_DIR
+SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://localhost:9000/user/spark/logs/ -Dspark.history.fs.cleaner.enabled=true"
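## note: spark.history.fs.logDirectory above must point at the same HDFS dir as spark.eventLog.dir in spark-defaults.conf below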
$ diff -u spark-defaults.conf.template spark-defaults.conf
--- spark-defaults.conf.template 2022-06-24 09:16:18.000000000 +0800
+++ spark-defaults.conf 2022-06-24 16:19:02.000000000 +0800
@@ -25,3 +25,8 @@
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+
+spark.eventLog.enabled true
+spark.eventLog.dir hdfs://localhost:9000/user/spark/logs
+spark.yarn.historyServer.address localhost:18080
+spark.yarn.jars hdfs://localhost:9000/user/spark/jars/*
## create dirs and upload spark jars to HDFS
$ hdfs dfs -mkdir -p /user/spark
$ hdfs dfs -put jars /user/spark
$ hdfs dfs -mkdir -p /user/spark/logs
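## optional check that the jars landed where spark.yarn.jars points; with spark.yarn.jars set,
## spark-submit reuses these HDFS jars instead of re-uploading the local jars dir on every run
$ hdfs dfs -ls /user/spark/jars | head -3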
## restart yarn with JobHistoryServer and spark HistoryServer
$ start-yarn.sh
$ mr-jobhistory-daemon.sh start historyserver
$ sbin/start-history-server.sh
$ jps
5696 SecondaryNameNode
5955 JobHistoryServer
5509 NameNode
5813 ResourceManager
5899 NodeManager
6683 HistoryServer
5597 DataNode
6702 Jps
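## optional: confirm the web UIs respond (ports per the config above); expect an HTTP status like 200 or 302
$ curl -s -o /dev/null -w '%{http_code}\n' http://localhost:8088     ## YARN ResourceManager
$ curl -s -o /dev/null -w '%{http_code}\n' http://localhost:19888    ## MapReduce JobHistoryServer
$ curl -s -o /dev/null -w '%{http_code}\n' http://localhost:18080    ## Spark HistoryServer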
## test
$ cat test.sh
#!/bin/sh
run() {
bin/spark-submit \
--master yarn \
--deploy-mode cluster \
--driver-memory 512m \
--executor-memory 512m \
--num-executors 1 \
--class org.apache.spark.examples.SparkPi \
examples/jars/spark-examples_2.12-3.2.1.jar 10
}
## main ##
run
# the ResourceManager audit log lines end with APPID=<application id>
appid=`grep "APPID" $HADOOP_HOME/logs/yarn*.log | tail -1 | awk '{print $NF}'`
appid=${appid#*APPID=}
echo $appid
$HADOOP_HOME/bin/yarn logs -applicationId $appid
$ ./test.sh
22/06/25 09:42:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
application_1656115668743_0003
22/06/25 09:43:05 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/25 09:43:06 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
Container: container_1656115668743_0003_01_000001 on 192.168.124.7_52592
==========================================================================
LogType:stderr
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:379
Log Contents:
22/06/25 09:42:54 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
22/06/25 09:42:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/25 09:42:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
End of LogType:stderr
LogType:stdout
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:33
Log Contents:
Pi is roughly 3.1423911423911424
End of LogType:stdout
Container: container_1656115668743_0003_01_000002 on 192.168.124.7_52592
==========================================================================
LogType:stderr
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:379
Log Contents:
22/06/25 09:43:00 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
22/06/25 09:43:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/25 09:43:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
End of LogType:stderr
LogType:stdout
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:0
Log Contents:
End of LogType:stdout
The actual program output is "Pi is roughly 3.1423911423911424"; in cluster deploy mode it appears in the driver's stdout, i.e. the first container above.
The same result can be seen from the YARN web UI: http://localhost:8088/cluster -> application id -> Logs.
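If grepping the ResourceManager log for APPID feels fragile, a sketch of an alternative using the yarn CLI (assuming the SparkPi run is the most recently finished application):
$ appid=`$HADOOP_HOME/bin/yarn application -list -appStates FINISHED 2>/dev/null | tail -1 | awk '{print $1}'`
$ $HADOOP_HOME/bin/yarn logs -applicationId $appid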
Reference: Overview - Spark 3.2.1 Documentation (https://spark.apache.org/docs/3.2.1/)