Upload the Scala and Spark installation packages
Extract the Scala and Spark archives
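For example, assuming the tarballs were uploaded to /spark (the archive file names here are only illustrative and may differ):
cd /spark
tar -zxvf spark-3.1.1-bin-without-hadoop.tgz
tar -zxvf scala-2.12.17.tgz
This yields the /spark/spark-3.1.1-bin-without-hadoop and /spark/scala-2.12.17 directories used below.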
Configure the environment variables
vim ~/.bashrc
#spark
export SPARK_HOME=/spark/spark-3.1.1-bin-without-hadoop
export PATH=${SPARK_HOME}/bin:$PATH
#scala
export SCALA_HOME=/spark/scala-2.12.17
export PATH=${SCALA_HOME}/bin:$PATH
Copy the environment variable file to the other nodes
scp -r ~/.bashrc 192.168.233.102:~/.bashrc
scp -r ~/.bashrc 192.168.233.103:~/.bashrc
Apply the environment variables
source ~/.bashrc    (run this on every node)
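A quick way to confirm the variables took effect on each node (output will vary with the installed versions):
echo $SPARK_HOME
echo $SCALA_HOME
scala -version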
Configuration files
Configure spark-env.sh
vim /spark/spark-3.1.1-bin-without-hadoop/conf/spark-env.sh
export JAVA_HOME=/java/java1.8
export SCALA_HOME=/spark/scala-2.12.17
export SPARK_WORKER_MEMORY=30G
export SPARK_WORKER_CORES=16
export SPARK_WORKER_INSTANCES=1
export SPARK_MASTER_IP=192.168.233.101
export SPARK_DIST_CLASSPATH=$(/hadoop/hadoop3.1.3/bin/hadoop classpath)
export HADOOP_CONF_DIR=/hadoop/hadoop3.1.3/etc/hadoop/
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 "
# SPARK_WORKER_MEMORY: memory available to this worker for computation
# SPARK_WORKER_CORES: number of cores available for computation
# SPARK_WORKER_INSTANCES: number of Worker instances on this node
# SPARK_MASTER_IP: IP address of the master node
# SPARK_DIST_CLASSPATH: classpath of dependency jars; multiple paths can be set
# HADOOP_CONF_DIR: path to the Hadoop configuration files
# SPARK_HISTORY_OPTS: history server options (there are many more available); spark.history.ui.port is the web UI port
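A quick sanity check: SPARK_DIST_CLASSPATH is built from the output of hadoop classpath, so it is worth confirming the command resolves on this node before starting Spark:
/hadoop/hadoop3.1.3/bin/hadoop classpath
It should print a long list of Hadoop jar paths; if it fails, the "without hadoop" Spark build will not be able to find the Hadoop classes.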
Configure workers
vim /spark/spark-3.1.1-bin-without-hadoop/conf/workers
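The workers file lists one worker host per line. Assuming the other two nodes run the Workers (add 192.168.233.101 as well if the master node should also run a Worker), it would contain:
192.168.233.102
192.168.233.103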
Remote copy
Purpose: so that every compute task in the Spark cluster can access HDFS directly, the Hadoop configuration files hdfs-site.xml and core-site.xml also need to be copied to the other two nodes, into Spark's conf directory.
cd /hadoop/hadoop3.1.3/etc/hadoop/
scp -r hdfs-site.xml 192.168.233.102:/spark/spark-3.1.1-bin-without-hadoop/conf/
scp -r core-site.xml 192.168.233.102:/spark/spark-3.1.1-bin-without-hadoop/conf/
scp -r hdfs-site.xml 192.168.233.103:/spark/spark-3.1.1-bin-without-hadoop/conf/
scp -r core-site.xml 192.168.233.103:/spark/spark-3.1.1-bin-without-hadoop/conf/
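To confirm the files arrived, list the remote conf directories; both hdfs-site.xml and core-site.xml should appear in the output:
ssh 192.168.233.102 ls /spark/spark-3.1.1-bin-without-hadoop/conf/
ssh 192.168.233.103 ls /spark/spark-3.1.1-bin-without-hadoop/conf/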
Copy Scala and Spark to every node
scp -r /spark/spark-3.1.1-bin-without-hadoop 192.168.233.102:/spark/
scp -r /spark/scala-2.12.17 192.168.233.102:/spark/
scp -r /spark/spark-3.1.1-bin-without-hadoop 192.168.233.103:/spark/
scp -r /spark/scala-2.12.17 192.168.233.103:/spark/
Start the cluster
start-dfs.sh
$SPARK_HOME/sbin/start-master.sh
$SPARK_HOME/sbin/start-workers.sh
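After the scripts finish, running jps on each node should show the HDFS daemons started by start-dfs.sh, plus a Master process on 192.168.233.101 and a Worker process on the other two nodes:
jps
The master web UI is then reachable at http://192.168.233.101:8080 by default. Note that the history server configured on port 18080 above is started separately with $SPARK_HOME/sbin/start-history-server.sh; it is not started by these scripts.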