1. Download scala-2.12.8.tgz and spark-2.4.3-bin-hadoop2.7.tgz.
2. Move the archives to /usr:
[root@master bigdata]# mv scala-2.12.8.tgz spark-2.4.3-bin-hadoop2.7.tgz /usr
3. Extract the Scala archive:
tar -zxvf scala-2.12.8.tgz
4. Rename scala-2.12.8 to scala:
mv scala-2.12.8 scala
5. Configure the environment variables:
[root@master usr]# vi /etc/profile
export SCALA_HOME=/usr/scala
export PATH=$PATH:$SCALA_HOME/bin
[root@master usr]# source /etc/profile    # make the configuration take effect
[root@master usr]# echo $PATH             # check that the new path is present
6. Copy Scala to the other nodes, slave1 and slave2. Verify the local installation first; note that scp needs -r to copy a directory:
[root@master usr]# scala -version
Scala code runner version 2.12.8 -- Copyright 2002-2018, LAMP/EPFL and Lightbend, Inc.
[root@master usr]# scp scala slave1:/usr
scala: not a regular file
[root@master usr]# scp -r scala slave1:/usr
7. Configure the environment variables on slave1, copy /etc/profile to slave2, and run source /etc/profile on slave2 as well:
export SCALA_HOME=/usr/scala
export PATH=$PATH:$SCALA_HOME/bin
:wq
[root@slave1 usr]# source /etc/profile
[root@slave1 usr]# scp -r /etc/profile slave2:/etc
profile 100% 2140 1.2MB/s 00:00
[root@slave1 usr]#
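A compact alternative for steps 6-7, assuming passwordless SSH between the nodes is already set up (as it normally is for a Hadoop cluster); this loop is a sketch, not part of the original transcript:

for host in slave1 slave2; do
  scp -r /usr/scala ${host}:/usr
  scp /etc/profile ${host}:/etc
  ssh ${host} "source /etc/profile && scala -version"   # confirm Scala works on each node
done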
8. Install Spark: extract the archive (tar -zxvf spark-2.4.3-bin-hadoop2.7.tgz), rename the directory, and create spark-env.sh from its template:
[root@master usr]# mv spark-2.4.3-bin-hadoop2.7 spark
[root@master usr]# cd spark
[root@master spark]# ls
bin data jars LICENSE NOTICE R RELEASE yarn
conf examples kubernetes licenses python README.md sbin
[root@master spark]# cd conf
[root@master conf]# ls
docker.properties.template metrics.properties.template spark-env.sh.template
fairscheduler.xml.template slaves.template
log4j.properties.template spark-defaults.conf.template
[root@master conf]# cp spark-env.sh.template spark-env.sh
[root@master conf]# vi spark-env.sh
9. Configure spark-env.sh. Verify the JDK location first, then edit the file:
[root@master usr]# cd java
[root@master java]# ls
jdk jdk-8u201-linux-x64.tar.gz
[root@master java]# cd ..
[root@master usr]# cd spark/conf
[root@master conf]# vi spark-env.sh
JAVA_HOME=/usr/java/jdk
SCALA_HOME=/usr/scala
HADOOP_CONF_DIR=/usr/hadoop/etc/hadoop
SPARK_LOCAL_IP=master                 # hostname/IP of this node; change on each node
SPARK_LOG_DIR=/bigdata/logs/spark     # daemon log files
SPARK_PID_DIR=/bigdata/pid/spark      # daemon PID files
SPARK_LOCAL_DIRS=/bigdata/tmp/spark   # scratch space for shuffle and spill data
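The start scripts create a missing log directory themselves, but creating all three paths on every node up front surfaces permission problems early. A sketch, again assuming passwordless SSH:

for host in master slave1 slave2; do
  ssh ${host} "mkdir -p /bigdata/logs/spark /bigdata/pid/spark /bigdata/tmp/spark"
done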
10. Edit the slaves file:
[root@master conf]# cp slaves.template slaves
[root@master conf]# vi slaves
# A Spark Worker will be started on each of the machines listed below.
master
slave1
slave2
11. Copy Spark to the other nodes, and on each node change SPARK_LOCAL_IP to that node's hostname or IP (shown by hand for slave1 below; a scripted sketch follows the listing):
[root@master usr]# scp -r /usr/spark slave1:/usr
[root@slave1 conf]# vi spark-env.sh
JAVA_HOME=/usr/java/jdk
SCALA_HOME=/usr/scala
HADOOP_CONF_DIR=/usr/hadoop/etc/hadoop
SPARK_LOCAL_IP=slave1
SPARK_LOG_DIR=/bigdata/logs/spark
SPARK_PID_DIR=/bigdata/pid/spark
SPARK_LOCAL_DIRS=/bigdata/tmp/spark
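Rather than editing spark-env.sh by hand on every worker, SPARK_LOCAL_IP can be patched in place after copying; a sketch, assuming /usr/spark has already been scp'd to both slaves and the hostnames are the values to write:

for host in slave1 slave2; do
  ssh ${host} "sed -i 's/^SPARK_LOCAL_IP=.*/SPARK_LOCAL_IP=${host}/' /usr/spark/conf/spark-env.sh"
done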
12. Configure the Spark environment variables. On master:
[root@master usr]# vi /etc/profile
export SPARK_HOME=/usr/spark
export PATH=$PATH:$SPARK_HOME/bin
Add the same two export lines on slave1, source the profile, and propagate it to slave2 (run source /etc/profile on slave2 afterwards):
[root@slave1 conf]# cd /usr
[root@slave1 usr]# vi /etc/profile
[root@slave1 usr]# source /etc/profile
[root@slave1 usr]# scp -r /etc/profile slave2:/etc
profile 100% 2203 1.0MB/s 00:00
[root@slave1 usr]#
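Before moving on, it is worth confirming that every node resolves spark-shell through the new PATH; a quick sketch:

for host in master slave1 slave2; do
  echo "== ${host} =="
  ssh ${host} "source /etc/profile && which spark-shell"
done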
13. Check that the configuration works by launching spark-shell (the banner reports Scala 2.11.12 because the prebuilt Spark 2.4.3 binaries bundle their own Scala; the Scala 2.12.8 installed above is for your own tooling):
[root@master usr]# spark-shell
19/06/05 07:45:20 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://master:4040
Spark context available as 'sc' (master = local[*], app id = local-1559691955517).
Spark session available as 'spark'.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.4.3
/_/
Using Scala version 2.11.12 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_201)
Type in expressions to have them evaluated.
Type :help for more information.
scala>
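After :quit-ting the shell, a non-interactive sanity check is also possible with the bundled examples; run-example ships in /usr/spark/bin and defaults to local mode, so it works even before the cluster is started:

/usr/spark/bin/run-example SparkPi 10   # should end with a line like 'Pi is roughly 3.14...'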
14. Start the Spark cluster from the master node:
[root@master usr]# cd ..
[root@master /]# /usr/spark/sbin/start-all.sh
starting org.apache.spark.deploy.master.Master, logging to /bigdata/logs/spark/spark-root-org.apache.spark.deploy.master.Master-1-master.out
master: starting org.apache.s...
or, equivalently, from the sbin directory:
[root@master spark]# cd sbin
[root@master sbin]# ./start-all.sh
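To confirm the daemons actually came up, check with jps on every node: master should show both a Master and a Worker (master itself is listed in slaves), and each slave should show a Worker.

for host in master slave1 slave2; do
  echo "== ${host} =="
  ssh ${host} jps
done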
15. Check via the web UI: open http://192.168.79.11:8080 (the standalone Master's web port) in a browser.
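If no browser is available, the same check can be scripted; a sketch (the page title names the Master URL):

curl -s http://192.168.79.11:8080 | grep -o "<title>.*</title>"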
16. View the log files, which live under the directory configured in SPARK_LOG_DIR:
[root@master sbin]# cd /bigdata
[root@master bigdata]# ls
logs  pid  tmp
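The *.out files under the log directory hold each daemon's stdout/stderr; tailing them is the quickest way to diagnose a Worker that failed to register:

tail -n 50 /bigdata/logs/spark/*.out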
17. Spark shell: count the lines of /root/hiveTestData2.txt:
[root@master spark]# spark-shell --master spark://master:7077
19/06/05 23:04:13 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
scala> val theFile = spark.read.textFile("file:///root/hiveTestData2.txt")
theFile: org.apache.spark.sql.Dataset[String] = [value: string]
scala> theFile.count()
res3: Long = 199
scala> theFile.first()
res4: String = 20111230111529 90dad1d0612387afb6998415bdc10349 �й������̷���ϸ���� http://wenwen.soso.com/z/q343414009.htm
scala> :quit
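The line count can be cross-checked from the OS side (the two numbers can differ by one if the file lacks a trailing newline):

wc -l /root/hiveTestData2.txt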
18. The query text above is garbled. To try fixing the encoding, edit both launcher scripts:
[root@master scala]# vi `which scalac`
#!/usr/bin/env bash
[root@master usr]# vi `which scala`
#!/usr/bin/env bash
In each file, find the line
[ -n "$JAVA_OPTS" ] || JAVA_OPTS="-Xmx256M -Xms32M"
and replace it with
[ -n "$JAVA_OPTS" ] || JAVA_OPTS="-Xmx256M -Xms32M -Dfile.encoding=UTF-8"
The output is still garbled, however:
scala> theFile.first()
res2: String = 20111230111529 90dad1d0612387afb6998415bdc10349 �й������̷���ϸ���� http://wenwen.soso.com/z/q343414009.htm
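The persistence of the garbling suggests the data file itself is not UTF-8 (the byte pattern is typical of GBK/GB2312-encoded Chinese text); -Dfile.encoding only changes the JVM's default charset and cannot repair bytes read under the wrong encoding. One pragmatic fix, assuming the file really is GBK (verify first), is to convert it once and read the converted copy:

file /root/hiveTestData2.txt                                      # inspect the encoding guess
iconv -f GBK -t UTF-8 /root/hiveTestData2.txt -o /root/hiveTestData2_utf8.txt

Then point spark.read.textFile at file:///root/hiveTestData2_utf8.txt.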