搭建Spark on yarn环境

1) Build
## download spark-3.2.1.tgz from http://archive.apache.org/dist/
## unpack to ~/work/spark-3.2.1-src
$ cd ~/work/spark-3.2.1-src
$ export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
$ dev/make-distribution.sh --name without-hadoop \
    --pip --tgz -Phive -Phive-thriftserver -Phadoop-provided -Pyarn
$ tar xvf spark-3.2.1-bin-without-hadoop.tgz -C ..
$ cd ..
$ mv spark-3.2.1-bin-without-hadoop spark-3.2.1

## configure
$ cd spark-3.2.1
$ diff -u conf/spark-env.sh.template conf/spark-env.sh

--- conf/spark-env.sh.template	2022-06-24 09:16:18.000000000 +0800
+++ conf/spark-env.sh	2022-06-24 17:52:47.000000000 +0800
@@ -71,3 +71,7 @@
 # You might get better performance to enable these options if using native BLAS (see SPARK-21305).
 # - MKL_NUM_THREADS=1        Disable multi-threading of Intel MKL
 # - OPENBLAS_NUM_THREADS=1   Disable multi-threading of OpenBLAS
+
+JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
+SPARK_LOCAL_IP=localhost
+SPARK_DIST_CLASSPATH=`hadoop classpath`

$ diff -u conf/log4j.properties.template conf/log4j.properties

--- conf/log4j.properties.template	2022-06-24 09:16:18.000000000 +0800
+++ conf/log4j.properties	2022-06-24 16:28:28.000000000 +0800
@@ -16,7 +16,7 @@
 #
 
 # Set everything to be logged to the console
-log4j.rootCategory=INFO, console
+log4j.rootCategory=WARN, console
 log4j.appender.console=org.apache.log4j.ConsoleAppender
 log4j.appender.console.target=System.err
 log4j.appender.console.layout=org.apache.log4j.PatternLayout

## test
$ bin/spark-submit \
    --class org.apache.spark.examples.SparkPi \
    examples/jars/spark-examples_2.12-3.2.1.jar 10

22/06/24 17:53:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Pi is roughly 3.1387311387311385

2) Spark on yarn
## configure yarn
$ cd ~/work/hadoop
$ diff -u etc/hadoop/yarn-site.xml.orig etc/hadoop/yarn-site.xml

--- etc/hadoop/yarn-site.xml    2022-05-17 09:20:54.000000000 +0800
+++ /Users/sun_xo/work/hadoop/etc/hadoop/yarn-site.xml  2022-06-23 10:13:52.000000000 +0800
@@ -29,4 +29,17 @@
         <name>yarn.log-aggregation.retain-seconds</name>
         <value>604800</value>
     </property>
+    <property>
+        <name>yarn.log.server.url</name>
+        <value>http://localhost:19888/jobhistory/logs</value>
+    </property>
+    <!-- close yarn memory check -->
+    <property>
+        <name>yarn.nodemanager.pmem-check-enabled</name>
+        <value>false</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.vmem-check-enabled</name>
+        <value>false</value>
+    </property>
 </configuration>

## configure spark
$ diff -u spark-env.sh.template spark-env.sh

--- spark-env.sh.template   2022-06-24 09:16:18.000000000 +0800
+++ spark-env.sh    2022-06-24 18:49:42.000000000 +0800
@@ -71,3 +71,10 @@
 # You might get better performance to enable these options if using native BLAS (see SPARK-21305).
 # - MKL_NUM_THREADS=1        Disable multi-threading of Intel MKL
 # - OPENBLAS_NUM_THREADS=1   Disable multi-threading of OpenBLAS
+
+JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk1.8.0_321.jdk/Contents/Home
+SPARK_LOCAL_IP=localhost
+SPARK_DIST_CLASSPATH=`hadoop classpath`
+HADOOP_CONF_DIR=~/work/hadoop/etc/hadoop
+YARN_CONF_DIR=$HADOOP_CONF_DIR
+SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://localhost:9000/user/spark/logs/ -Dspark.history.fs.cleaner.enabled=true"

$ diff -u spark-defaults.conf.template spark-defaults.conf

--- spark-defaults.conf.template    2022-06-24 09:16:18.000000000 +0800
+++ spark-defaults.conf 2022-06-24 16:19:02.000000000 +0800
@@ -25,3 +25,8 @@
 # spark.serializer                 org.apache.spark.serializer.KryoSerializer
 # spark.driver.memory              5g
 # spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+
+spark.eventLog.enabled             true
+spark.eventLog.dir                  hdfs://localhost:9000/user/spark/logs
+spark.yarn.historyServer.address    localhost:18080
+spark.yarn.jars                     hdfs://localhost:9000/user/spark/jars/*

## create dirs and upload spark jars to HDFS
$ hdfs dfs -mkdir -p /user/spark
$ hdfs dfs -put jars /user/spark
$ hdfs dfs -mkdir -p /user/spark/logs

## restart yarn with JobHistoryServer and spark HistoryServer
$ start-yarn.sh
$ mr-jobhistory-daemon.sh start historyserver
$ sbin/start-history-server.sh
$ jps

5696 SecondaryNameNode
5955 JobHistoryServer
5509 NameNode
5813 ResourceManager
5899 NodeManager
6683 HistoryServer
5597 DataNode
6702 Jps

## test
$ cat test.sh

#!/bin/sh
# Submit the SparkPi example to YARN (cluster mode), then locate the
# application id in the ResourceManager log and fetch the aggregated
# container logs with "yarn logs".  Requires HADOOP_HOME to be set.

# Run SparkPi on YARN: cluster deploy mode, 1 executor, 512m driver/executor.
run() {
    bin/spark-submit \
        --master yarn \
        --deploy-mode cluster \
        --driver-memory 512m \
        --executor-memory 512m \
        --num-executors 1 \
        --class org.apache.spark.examples.SparkPi \
        examples/jars/spark-examples_2.12-3.2.1.jar 10
}

## main ##
run

# RM log lines contain "... APPID=application_<cluster>_<seq>"; take the last.
# Fix: the original used `awk 'pirnt $NF'` — a typo ("pirnt") and a missing
# action block; awk requires '{print $NF}'.
appid=$(grep "APPID" "$HADOOP_HOME"/logs/yarn*.log | tail -1 | awk '{print $NF}')
appid=${appid#*APPID=}          # strip everything through "APPID="
if [ -z "$appid" ]; then
    echo "could not determine application id from $HADOOP_HOME/logs" >&2
    exit 1
fi
echo "$appid"
"$HADOOP_HOME/bin/yarn" logs -applicationId "$appid"

$ test.sh

22/06/25 09:42:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
application_1656115668743_0003
22/06/25 09:43:05 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/25 09:43:06 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032


Container: container_1656115668743_0003_01_000001 on 192.168.124.7_52592
==========================================================================
LogType:stderr
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:379
Log Contents:
22/06/25 09:42:54 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
22/06/25 09:42:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/25 09:42:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
End of LogType:stderr

LogType:stdout
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:33
Log Contents:
Pi is roughly 3.1423911423911424
End of LogType:stdout



Container: container_1656115668743_0003_01_000002 on 192.168.124.7_52592
==========================================================================
LogType:stderr
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:379
Log Contents:
22/06/25 09:43:00 WARN Utils: Your hostname, sun-xo.local resolves to a loopback address: 127.0.0.1; using 192.168.124.7 instead (on interface en0)
22/06/25 09:43:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/06/25 09:43:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
End of LogType:stderr

LogType:stdout
Log Upload Time:Sat Jun 25 09:43:05 +0800 2022
LogLength:0
Log Contents:
End of LogType:stdout

Actually the output of program is "Pi is roughly 3.1423911423911424"
or you can see same result from http://localhost:8088/cluster -> appid -> logs

Reference: "Running Spark on YARN" — Spark 3.2.1 Documentation

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值