Installing Scala
Download Scala 2.12 and extract it to /usr/local/scala.
Then add the environment variables:
export SCALA_HOME=/usr/local/scala
export PATH=$PATH:$SCALA_HOME/bin
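Reload the shell profile and verify the install; a minimal check, assuming the exports above went into ~/.bashrc:
source ~/.bashrc
scala -version    # should report Scala 2.12.x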
Installing Spark
- Download Spark 2.4
- Extract it to /usr/local/spark
- Add the environment variables
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin
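Likewise reload and confirm the Spark binaries resolve; spark-submit ships with Spark:
source ~/.bashrc
spark-submit --version    # should print the Spark 2.4.x version banner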
- Configure log4j logging
cd /usr/local/spark/conf
cp log4j.properties.template log4j.properties
sudo gedit log4j.properties
Change log4j.rootCategory=INFO, console
to log4j.rootCategory=WARN, console so that only WARN-level messages reach the console.
- Run Spark locally
pyspark --master local[4]
## once inside the shell, enter
sc.master
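In this mode sc.master should return 'local[4]'. A quick sanity job in the same shell:
sc.parallelize(range(100)).count()    # should return 100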
- Run pyspark on Hadoop YARN
HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop pyspark --master yarn --deploy-mode client
sc.master
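Here sc.master should return 'yarn'. To confirm that work actually lands on YARN executors, count a file from HDFS; the path below is hypothetical:
textFile = sc.textFile("hdfs://master:9000/user/hadoop/test.txt")    # hypothetical HDFS path
textFile.count()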
- View the YARN ResourceManager web UI
http://localhost:8088/
Building a Spark Standalone Cluster
- Configure spark-env.sh
cp /usr/local/spark/conf/spark-env.sh.template /usr/local/spark/conf/spark-env.sh
sudo gedit /usr/local/spark/conf/spark-env.sh
export SPARK_MASTER_IP=master
export SPARK_WORKER_CORES=1
export SPARK_WORKER_MEMORY=2G
export SPARK_WORKER_INSTANCES=4
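What these settings mean, per the Spark standalone docs (note that SPARK_MASTER_IP is deprecated in Spark 2.x in favor of SPARK_MASTER_HOST):
# SPARK_MASTER_IP          host the Master binds to
# SPARK_WORKER_CORES       cores each Worker instance may use
# SPARK_WORKER_MEMORY      memory each Worker instance may use
# SPARK_WORKER_INSTANCES   number of Worker instances to run per node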
Copy the Spark installation from master to data1, data2, and data3 (the steps for data1 are shown; a scripted version for all three nodes follows below):
ssh data1
sudo mkdir /usr/local/spark
sudo chown hadoop:hadoop /usr/local/spark
exit
sudo scp -r /usr/local/spark hadoop@data1:/usr/local
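The per-node steps above can also be scripted; a rough sketch, assuming passwordless ssh and passwordless sudo for the hadoop user on every data node:
for host in data1 data2 data3; do
  ssh hadoop@$host "sudo mkdir -p /usr/local/spark && sudo chown hadoop:hadoop /usr/local/spark"
  scp -r /usr/local/spark hadoop@$host:/usr/local
done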
- Edit the slaves file
sudo gedit /usr/local/spark/conf/slaves
# enter:
data1
data2
data3
- Edit the masters file
sudo gedit /usr/local/spark/conf/masters
# enter:
master
Running pyspark on the Spark Standalone Cluster
- Start the Spark standalone cluster
/usr/local/spark/sbin/start-all.sh
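To check that the daemons started, jps (from the JDK) lists the running Java processes:
jps               # on master: a Master process should be listed
ssh data1 jps     # on each data node: Worker processes (4 per node with the settings above)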
- Run pyspark on the Spark standalone cluster
pyspark --master spark://master:7077 --num-executors 1 --total-executor-cores 3 --executor-memory 1g
sc.master
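sc.master should now report spark://master:7077, and a small job should spread across the workers:
sc.parallelize(range(1000)).sum()    # distributed sanity job; should return 499500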
- View the Spark Master web UI
http://master:8080/