Spark Installation and Usage
1 Install Scala
Install Scala 2.11.8 and Spark 2.3.2: extract both archives into /usr/local and rename the extracted directories to scala and spark (a sketch of the commands follows the listing below).
[root@hadoop1 examples]# cd /usr/local/
[root@hadoop1 local]# ls
bin games hbase include lib64 nginx Python-3.8.0 sbin scala-2.11.8.tgz spark sqoop src
etc hadoop hive lib libexec python Python-3.8.0.tgz scala share spark-2.3.2-bin-hadoop2.7.tgz sqoop-1.4.4.bin__hadoop-2.0.4-alpha.tar.gz zookeeper
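The extract-and-rename commands themselves are not captured in the transcript above; a minimal sketch, assuming the two tarballs were downloaded into /usr/local:
tar -zxvf scala-2.11.8.tgz && mv scala-2.11.8 scala
tar -zxvf spark-2.3.2-bin-hadoop2.7.tgz && mv spark-2.3.2-bin-hadoop2.7 spark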
2 Update Environment Variables
Edit the environment variables to add entries for Scala and Spark:
vim /etc/profile
export SCALA_HOME=/usr/local/scala
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$JAVA_HOME/bin:$JAVA_HOME/sbin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HBASE_HOME/bin:$HBASE_HOME/sbin:$HIVE_HOME/bin:$HIVE_HOME/sbin:$SQOOP_HOME/bin:$SCALA_HOME/bin:$SPARK_HOME/bin
source /etc/profile
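To confirm the new variables took effect, the installed versions can be checked from any shell; both commands print a version banner and exit:
scala -version
spark-submit --version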
3 Modify the Spark Configuration
[root@hadoop1 spark]# cd /usr/local/spark/conf/
[root@hadoop1 conf]# ls
docker.properties.template log4j.properties.template slaves spark-defaults.conf spark-env.sh
fairscheduler.xml.template metrics.properties.template slaves.template spark-defaults.conf.template spark-env.sh.template
[root@hadoop1 conf]# cp slaves.template slaves
[root@hadoop1 conf]# cp spark-env.sh.template spark-env.sh
[root@hadoop1 conf]# cp spark-defaults.conf.template spark-defaults.conf
[root@hadoop1 conf]# cat spark-env.sh
#!/usr/bin/env bash
export JAVA_HOME=/usr/java/jdk1.8.0_271-amd64
export SCALA_HOME=/usr/local/scala
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_MASTER_HOST=hadoop1
export SPARK_PID_DIR=/usr/local/spark/data/pid
export SPARK_LOCAL_DIRS=/usr/local/spark/data/spark_shuffle
export SPARK_EXECUTOR_MEMORY=2G
export SPARK_WORKER_MEMORY=8G
[root@hadoop1 conf]#
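SPARK_PID_DIR and SPARK_LOCAL_DIRS above point under /usr/local/spark/data, which does not exist in a freshly unpacked distribution; creating the directories up front (on the master and, once distributed, on each worker) avoids missing-path surprises at startup. A precautionary sketch, not part of the original transcript:
mkdir -p /usr/local/spark/data/pid /usr/local/spark/data/spark_shuffle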
[root@hadoop1 conf]# cat spark-defaults.conf
# (Apache License 2.0 header omitted)
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
# Example:
spark.master spark://hadoop1:7077
spark.eventLog.enabled true
spark.eventLog.dir hdfs://hadoop1:9000/eventLog
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 2g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
[root@hadoop1 conf]#
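Note that spark.eventLog.dir points at an HDFS path, and Spark will refuse to start an application if the event-log directory does not already exist. Assuming HDFS is up with the NameNode at hadoop1:9000, it can be created once:
hdfs dfs -mkdir -p /eventLog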
# Add worker (compute) nodes
[root@hadoop1 conf]# cat slaves
# (Apache License 2.0 header omitted)
# A Spark Worker will be started on each of the machines listed below.
hadoop2
hadoop3
[root@hadoop1 conf]#
# Distribute scala and spark to the workers hadoop2 and hadoop3, and update /etc/profile on each (note the -r flag: scp needs it to copy directories)
[root@hadoop1 conf]# scp -r /usr/local/scala hadoop2:/usr/local/
[root@hadoop1 conf]# scp -r /usr/local/scala hadoop3:/usr/local/
[root@hadoop1 conf]# scp -r /usr/local/spark hadoop2:/usr/local/
[root@hadoop1 conf]# scp -r /usr/local/spark hadoop3:/usr/local/
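The /etc/profile additions from step 2 also need to reach the workers. Assuming identical install paths on every node, the simplest route is to copy the file itself; it is re-read at the next login (or after running source /etc/profile on each node):
scp /etc/profile hadoop2:/etc/profile
scp /etc/profile hadoop3:/etc/profile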
4 Test Spark
# Start Hadoop (the "Stop it first" messages below simply mean the daemons were already running)
[root@hadoop1 conf]# start-all.sh
Starting namenodes on [hadoop1]
Last login: Tue Feb 2 08:55:23 CST 2021 from 10.2.33.165 on pts/0
hadoop1: namenode is running as process 19374. Stop it first.
Starting datanodes
Last login: Tue Feb 2 11:28:41 CST 2021 on pts/0
hadoop3: datanode is running as process 9056. Stop it first.
hadoop2: datanode is running as process 7838. Stop it first.
hadoop1: datanode is running as process 19541. Stop it first.
Starting secondary namenodes [hadoop1]
Last login: Tue Feb 2 11:28:42 CST 2021 on pts/0
hadoop1: secondarynamenode is running as process 19817. Stop it first.
Starting resourcemanager
Last login: Tue Feb 2 11:28:44 CST 2021 on pts/0
resourcemanager is running as process 20121. Stop it first.
Starting nodemanagers
Last login: Tue Feb 2 11:28:50 CST 2021 on pts/0
hadoop2: nodemanager is running as process 7992. Stop it first.
hadoop3: nodemanager is running as process 9205. Stop it first.
hadoop1: nodemanager is running as process 20294. Stop it first.
# Start Spark
[root@hadoop1 conf]# /usr/local/spark/sbin/start-all.sh
org.apache.spark.deploy.master.Master running as process 31886. Stop it first.
hadoop3: org.apache.spark.deploy.worker.Worker running as process 28606. Stop it first.
hadoop2: org.apache.spark.deploy.worker.Worker running as process 25751. Stop it first.
[root@hadoop1 conf]#
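At this point a Master should be running on hadoop1 and a Worker on each slave. Running jps on each node is a quick way to confirm, and the standalone master also serves a web UI, on port 8080 by default:
jps | grep -E 'Master|Worker'
# then browse to http://hadoop1:8080 to see both workers registered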
# Go into the examples source tree and locate the Pi computation program
[root@hadoop1 conf]# cd /usr/local/spark/examples/src/main/
java/ python/ r/ resources/ scala/
[root@hadoop1 conf]# cd /usr/local/spark/examples/src/main/scala/org/apache/spark/examples/
[root@hadoop1 examples]# ls
BroadcastTest.scala graphx LocalFileLR.scala LogQuery.scala pythonconverters SparkHdfsLR.scala SparkPi.scala
DFSReadWriteTest.scala GroupByTest.scala LocalKMeans.scala ml SimpleSkewedGroupByTest.scala SparkKMeans.scala SparkTC.scala
DriverSubmissionTest.scala HdfsTest.scala LocalLR.scala mllib SkewedGroupByTest.scala SparkLR.scala sql
ExceptionHandlingTest.scala LocalALS.scala LocalPi.scala MultiBroadcastTest.scala SparkALS.scala SparkPageRank.scala streaming
[root@hadoop1 examples]# run-example S
SimpleSkewedGroupByTest.scala SparkALS.scala SparkKMeans.scala SparkPageRank.scala SparkTC.scala
SkewedGroupByTest.scala SparkHdfsLR.scala SparkLR.scala SparkPi.scala
# Run the SparkPi example (log output abridged below)
[root@hadoop1 examples]# run-example SparkPi
2021-02-02 11:31:59 INFO DAGScheduler:54 - Job 0 finished: reduce at SparkPi.scala:38, took 2.880396 s
Pi is roughly 3.147035735178676
2021-02-02 11:31:59 INFO AbstractConnector:318 - Stopped Spark@7f4037ed{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
2021-02-02 11:31:59 INFO SparkUI:54 - Stopped Spark web UI at http://hadoop1:4040
2021-02-02 11:31:59 INFO StandaloneSchedulerBackend:54 - Shutting down all executors
2021-02-02 11:31:59 INFO CoarseGrainedSchedulerBackend$DriverEndpoint:54 - Asking each executor to shut down
2021-02-02 11:31:59 INFO MapOutputTrackerMasterEndpoint:54 - MapOutputTrackerMasterEndpoint stopped!
2021-02-02 11:32:00 INFO MemoryStore:54 - MemoryStore cleared
2021-02-02 11:32:00 INFO BlockManager:54 - BlockManager stopped
2021-02-02 11:32:00 INFO BlockManagerMaster:54 - BlockManagerMaster stopped
2021-02-02 11:32:00 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
2021-02-02 11:32:00 INFO SparkContext:54 - Successfully stopped SparkContext
2021-02-02 11:32:00 INFO ShutdownHookManager:54 - Shutdown hook called
2021-02-02 11:32:00 INFO ShutdownHookManager:54 - Deleting directory /usr/local/spark/data/spark_shuffle/spark-5b21001c-fbab-4ce1-afe7-a697dcffd8b9
2021-02-02 11:32:00 INFO ShutdownHookManager:54 - Deleting directory /tmp/spark-26692d5a-a342-4b02-aeae-5209ed6f90f0
[root@hadoop1 examples]#
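run-example is a convenience wrapper around spark-submit; the equivalent direct submission looks roughly like the following (the examples jar path matches the layout of the 2.3.2 binary distribution, and the trailing 100 is the number of slices SparkPi samples over; adjust both as needed):
spark-submit --class org.apache.spark.examples.SparkPi \
  --master spark://hadoop1:7077 \
  /usr/local/spark/examples/jars/spark-examples_2.11-2.3.2.jar 100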
Hadoop installation reference: https://blog.csdn.net/zhaoyaxiong_ctu/article/details/113151938