安装包准备
下载包
2.3.4版本
# Download Spark 2.3.4 (Hadoop 2.6 build) from the Apache archive
wget https://archive.apache.org/dist/spark/spark-2.3.4/spark-2.3.4-bin-hadoop2.6.tgz
3.1.2版本
# Download Spark 3.1.2 (Hadoop 3.2 build) — `wget` was missing before the URL
wget https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
# Unpack the tarball and install it where SPARK_HOME will point.
tar xf spark-2.3.4-bin-hadoop2.6.tgz
# Use the absolute target path so this works from any cwd
# (the original relative `bigdata/spark-2.3.4` only worked when run from /opt).
mv spark-2.3.4-bin-hadoop2.6 /opt/bigdata/spark-2.3.4
# Append the two export lines below to /etc/profile, then `source /etc/profile`
vim /etc/profile
export SPARK_HOME=/opt/bigdata/spark-2.3.4
export PATH=$PATH:$SPARK_HOME/bin
Standalone部署Spark集群
修改默认配置
# Normalize the sample configs: strip the ".template" suffix from every file.
# NOTE: the loop already produces spark-env.sh, so the former
# `cp spark-env.sh.template spark-env.sh` step was both redundant and broken
# (the template no longer exists after the rename) — it has been dropped.
cd /opt/bigdata/spark-2.3.4/conf
for i in *.template; do mv -- "$i" "${i%.template}"; done
# Workers file read by sbin/start-slaves.sh: one hostname per line
echo -e "node02\nnode03\nnode04" > slaves
# Insert each export right after its commented placeholder in spark-env.sh
sed -i '/HADOOP_CONF_DIR/aexport HADOOP_CONF_DIR=/opt/bigdata/hadoop-2.6.5/etc/hadoop' spark-env.sh
sed -i '/SPARK_MASTER_HOST/aexport SPARK_MASTER_HOST=node01' spark-env.sh
sed -i '/SPARK_MASTER_PORT/aexport SPARK_MASTER_PORT=7077' spark-env.sh
sed -i '/SPARK_MASTER_WEBUI_PORT/aexport SPARK_MASTER_WEBUI_PORT=8080' spark-env.sh
sed -i '/SPARK_WORKER_CORES/aexport SPARK_WORKER_CORES=4' spark-env.sh
sed -i '/SPARK_WORKER_MEMORY/aexport SPARK_WORKER_MEMORY=4g' spark-env.sh
# Distribute the installation to the workers. Run from /opt/bigdata so the
# relative source path and $(pwd) both resolve to the install root
# (the previous step left cwd inside conf/). Quoted $(pwd) replaces backticks.
cd /opt/bigdata
scp -rp spark-2.3.4 node02:"$(pwd)"
scp -rp spark-2.3.4 node03:"$(pwd)"
scp -rp spark-2.3.4 node04:"$(pwd)"
node02,node03,node04
# On each worker node, register SPARK_HOME in /etc/profile as well
vim /etc/profile
export SPARK_HOME=/opt/bigdata/spark-2.3.4
集群启动命令介绍
$SPARK_HOME/sbin/start-master.sh   # start a master on this host
$SPARK_HOME/sbin/start-slaves.sh   # start a worker on every host listed in conf/slaves
$SPARK_HOME/sbin/start-slave.sh    # start a single worker on this host
$SPARK_HOME/sbin/start-all.sh      # start the master plus all conf/slaves workers
$SPARK_HOME/sbin/stop-master.sh
$SPARK_HOME/sbin/stop-slaves.sh
$SPARK_HOME/sbin/stop-all.sh
集群启动
# start-all.sh brings up the master here plus one worker per conf/slaves entry
$SPARK_HOME/sbin/start-all.sh
# (alternative) start only the master daemon on this node
$SPARK_HOME/sbin/start-master.sh
验证
# Build a small word-count sample file locally (the <<- form strips leading
# tabs only; the body below is flush-left so nothing is stripped)
cat >wc <<-EOF
hadoop spark
spark hadoop
oracle mysql postgresql
postgresql oracle mysql
mysql mongodb
hdfs yarn mapreduce
yarn hdfs
zookeeper
EOF
# Upload the sample to HDFS and confirm it landed
hdfs dfs -mkdir /sparktest
hdfs dfs -put wc /sparktest/data.txt
hdfs dfs -cat /sparktest/data.txt
# Either form launches an interactive shell against the standalone master
# (the ./spark-shell form assumes the current dir is $SPARK_HOME/bin)
./spark-shell
$SPARK_HOME/bin/spark-shell --master spark://node01:7077
# Run inside spark-shell (Scala): word count over the uploaded HDFS file
sc.textFile("hdfs://mycluster/sparktest/data.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect().foreach(println)
Standalone模式部署高可用Master集群
修改默认配置
# Repeat the base conf/ setup (same as the standalone section)
cd /opt/bigdata/spark-2.3.4/conf
for i in *.template; do mv ${i} ${i%.*};done
cd $SPARK_HOME/conf
echo -e "node02\nnode03\nnode04" > slaves
# --- settings to add to spark-env.sh ---
export HADOOP_CONF_DIR=/opt/bigdata/hadoop-2.6.5/etc/hadoop
# NOTE(review): under ZooKeeper HA each master should advertise its own
# hostname; SPARK_MASTER_HOST=node01 is only correct on node01 — confirm
export SPARK_MASTER_HOST=node01
export SPARK_MASTER_PORT=7077
export SPARK_MASTER_WEBUI_PORT=8080
export SPARK_WORKER_CORES=4
export SPARK_WORKER_MEMORY=4g
# --- settings to add to spark-defaults.conf ---
# NOTE(review): the Spark docs configure spark.deploy.* recovery options via
# SPARK_DAEMON_JAVA_OPTS in spark-env.sh; verify the master honors them here
spark.deploy.recoveryMode ZOOKEEPER
spark.deploy.zookeeper.url node02:2181,node03:2181,node04:2181
spark.deploy.zookeeper.dir /myspark
# Event logging and the history server both point at the same HDFS dir
spark.eventLog.enabled true
spark.eventLog.dir hdfs://mycluster/spark_log
spark.history.fs.logDirectory hdfs://mycluster/spark_log
# Distribute the updated installation. Run from /opt/bigdata so the relative
# source path and $(pwd) both resolve; quoted $(pwd) replaces backticks.
cd /opt/bigdata
scp -rp spark-2.3.4 node02:"$(pwd)"
scp -rp spark-2.3.4 node03:"$(pwd)"
scp -rp spark-2.3.4 node04:"$(pwd)"
node02,node03,node04
# On each of node02-04: add SPARK_HOME to /etc/profile
vim /etc/profile
export SPARK_HOME=/opt/bigdata/spark-2.3.4
分发配置,集群启动
分发配置、重启服务
在主控节点
$SPARK_HOME/sbin/start-all.sh
在node02 node02作为备master
$SPARK_HOME/sbin/start-master.sh
在node03 启动history
# The event-log directory must exist in HDFS before any app writes to it
hdfs dfs -mkdir /spark_log
# History server (on node03) serves completed apps at port 18080
$SPARK_HOME/sbin/start-history-server.sh
http://node03:18080
验证集群
交互命令
./spark-shell
$SPARK_HOME/bin/spark-shell --master spark://node01:7077,node02:7077
执行
sc.textFile("hdfs://mycluster/sparktest/data.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect().foreach(println)
spark-submit提交程序
cd /opt/bigdata/spark-2.3.4/examples/jars
cluster模式
# Cluster deploy mode: the driver itself runs on one of the workers, and
# --supervise restarts it if it dies. The local jar path must exist on every
# node (true here, since the whole install dir was copied).
# NOTE(review): `--conf hello=world` is not a spark.* property — spark-submit
# ignores it with a warning; it appears to be here only for demonstration.
../../bin/spark-submit \
--master spark://node01:7077,node02:7077 \
--class org.apache.spark.examples.SparkPi \
--deploy-mode cluster \
--supervise \
--conf hello=world \
./spark-examples_2.11-2.3.4.jar \
1000
client模式
# Client deploy mode (the default): the driver runs in this shell, so Pi's
# result is printed locally. Final `1000` is the number of partitions/slices.
../../bin/spark-submit \
--master spark://node01:7077,node02:7077 \
--class org.apache.spark.examples.SparkPi \
--conf hello=world \
./spark-examples_2.11-2.3.4.jar \
1000
查看提交程序的帮助说明
spark-submit --help
基于yarn模式部署Spark集群
主要操作
YARN调度模式
1. 关掉 spark的master和worker
2. spark on yarn :不需要 master,worker的配置,rm -fr slaves
3. 只需要启动yarn的角色
在主控节点
start-yarn.sh
手动启动resourcemanager(原文提示符后的命令缺失,疑为 yarn-daemon.sh start resourcemanager,请核实)
[root@node03 ~] yarn-daemon.sh start resourcemanager
[root@node04 ~] yarn-daemon.sh start resourcemanager
配置文件
# Switch conf/ to YARN mode: Spark no longer manages its own workers,
# so the slaves file is removed (plain file — -r is unnecessary).
cd $SPARK_HOME/conf
rm -f slaves
# spark-env.sh: only the Hadoop client configs are required
export HADOOP_CONF_DIR=/opt/bigdata/hadoop-2.6.5/etc/hadoop
# spark-defaults.conf: event logging + pre-staged jars (see last section)
spark.eventLog.enabled true
spark.eventLog.dir hdfs://mycluster/spark_log
spark.history.fs.logDirectory hdfs://mycluster/spark_log
spark.yarn.jars hdfs://mycluster/work/spark_lib/jars/*
# Push the two changed files to every node (quoted $(pwd) replaces backticks)
scp spark-env.sh spark-defaults.conf node02:"$(pwd)"
scp spark-env.sh spark-defaults.conf node03:"$(pwd)"
scp spark-env.sh spark-defaults.conf node04:"$(pwd)"
hadoop配置文件添加
yarn-site.xml
<!-- Resources each NodeManager advertises to the YARN scheduler -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>4</value>
</property>
<!-- Disable virtual-memory checking so Spark executors are not killed
     for exceeding the default vmem ratio -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
mapred-site.xml文件添加配置
<!-- MapReduce JobHistory server, hosted on node03 -->
<!-- NOTE(review): "mapred.job.history.server.embedded" is a legacy-style
     key; verify this Hadoop version still honors it -->
<property>
<name>mapred.job.history.server.embedded</name>
<value>true</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>node03:10020</value>
</property>
<!-- NOTE(review): the conventional JobHistory web UI port is 19888;
     confirm 50060 is intentional here -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>node03:50060</value>
</property>
<!-- Intermediate vs. final history dirs in HDFS -->
<property>
<name>mapreduce.jobhistory.intermediate-done-dir</name>
<value>/work/mr_history_tmp</value>
</property>
<property>
<name>mapreduce.jobhistory.done-dir</name>
<value>/work/mr-history_done</value>
</property>
然后在node03上启动historyserver服务
mr-jobhistory-daemon.sh start historyserver
分发配置文件
# Distribute the updated Hadoop configs to every other node
# (quoted $(pwd) replaces the legacy backticks and survives spaces)
cd /opt/bigdata/hadoop-2.6.5/etc/hadoop
scp yarn-site.xml mapred-site.xml node02:"$(pwd)"
scp yarn-site.xml mapred-site.xml node03:"$(pwd)"
scp yarn-site.xml mapred-site.xml node04:"$(pwd)"
启动yarn
在node01主控节点
start-yarn.sh
启动resourcemanager
node03
node04
访问 http://node03:8088/cluster
启动map的history服务 在node03
mr-jobhistory-daemon.sh start historyserver
启动spark
cd $SPARK_HOME/
# First launch is slow: without spark.yarn.jars Spark zips and uploads its
# whole jars/ dir to HDFS on every start (fixed in the last section).
# Fixed: removed the doubled slash and the trailing Chinese annotation that
# would have been passed to spark-shell as an argument.
$SPARK_HOME/bin/spark-shell --master yarn
启动后再集群中会出现以下服务
1. SparkSubmit
2. CoarseGrainedExecutorBackend
3. ExecutorLauncher
会发现 http://node03:8088/cluster 有SPARK
验证
执行
sc.textFile("hdfs://mycluster/sparktest/data.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect().foreach(println)
解决spark-shell启动慢
spark-default.conf配置文件
spark.yarn.jars hdfs://mycluster/work/spark_lib/jars/*
cd $SPARK_HOME/
# Pre-create the target dir, then upload Spark's jars once; spark.yarn.jars
# points at hdfs://mycluster/work/spark_lib/jars/* so later launches skip
# the per-launch upload of the whole jars/ directory.
hdfs dfs -mkdir -p /work/spark_lib
hdfs dfs -put jars /work/spark_lib/