spark安装
docker-compose文件
version: "2.2"
services:
namenode:
image: bde2020/hadoop-namenode:1.1.0-hadoop2.7.1-java8
container_name: namenode
volumes:
- hadoop_namenode:/hadoop/dfs/name
- ./input_files:/input_files
environment:
- CLUSTER_NAME=test
env_file:
- ./hadoop.env
ports:
- 50070:50070
resourcemanager:
image: bde2020/hadoop-resourcemanager:1.1.0-hadoop2.7.1-java8
container_name: resourcemanager
depends_on:
- namenode
- datanode1
- datanode2
env_file:
- ./hadoop.env
historyserver:
image: bde2020/hadoop-historyserver:1.1.0-hadoop2.7.1-java8
container_name: historyserver
depends_on:
- namenode
- datanode1
- datanode2
volumes:
- hadoop_historyserver:/hadoop/yarn/timeline
env_file:
- ./hadoop.env
nodemanager1:
image: bde2020/hadoop-nodemanager:1.1.0-hadoop2.7.1-java8
container_name: nodemanager1
depends_on:
- namenode
- datanode1
- datanode2
env_file:
- ./hadoop.env
datanode1:
image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
container_name: datanode1
depends_on:
- namenode
volumes:
- hadoop_datanode1:/hadoop/dfs/data
env_file:
- ./hadoop.env
datanode2:
image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
container_name: datanode2
depends_on:
- namenode
volumes:
- hadoop_datanode2:/hadoop/dfs/data
env_file:
- ./hadoop.env
datanode3:
image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
container_name: datanode3
depends_on:
- namenode
volumes:
- hadoop_datanode3:/hadoop/dfs/data
env_file:
- ./hadoop.env
master:
image: gettyimages/spark:2.3.0-hadoop-2.8
container_name: master
command: bin/spark-class org.apache.spark.deploy.master.Master -h master
hostname: master
environment:
MASTER: spark://master:7077
SPARK_CONF_DIR: /conf
SPARK_PUBLIC_DNS: localhost
links:
- namenode
expose:
- 7001
- 7002
- 7003
- 7004
- 7005
- 7077
- 6066
ports:
- 4040:4040
- 6066:6066
- 7077:7077
- 8080:8080
volumes:
- ./conf/master:/conf
- ./data:/tmp/data
- ./jars:/root/jars
worker:
image: gettyimages/spark:2.3.0-hadoop-2.8
container_name: worker
command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
hostname: worker
environment:
SPARK_CONF_DIR: /conf
SPARK_WORKER_CORES: 2
SPARK_WORKER_MEMORY: 1g
SPARK_WORKER_PORT: 8881
SPARK_WORKER_WEBUI_PORT: 8081
SPARK_PUBLIC_DNS: localhost
links:
- master
expose:
- 7012
- 7013
- 7014
- 7015
- 8881
ports:
- 8081:8081
volumes:
- ./conf/worker:/conf
- ./data:/tmp/data
volumes:
hadoop_namenode:
hadoop_datanode1:
hadoop_datanode2:
hadoop_datanode3:
hadoop_historyserver:
新建目录
mkdir /opt/spark
mv docker-compose.yml /opt/spark
启动spark
docker-compose start
docker-compose ps
查看启动状态
docker logs -f master
查看任务状态
http://192.168.147.140:8080/ 宿主主机IP + 端口
spark测试
运行内部实例
测试spark自带的例子
docker exec master spark-submit \
--master spark://master:7077 \
--class org.apache.spark.examples.SparkPi \
/usr/spark-2.3.0/examples/jars/spark-examples_2.11-2.3.0.jar 1000
Demo
引入Maven
编写WordCount
打包部署
mvn clean package -Dmaven.test.skip=true
更新jar包
docker cp jars/worldcount-1.0-SNAPSHOT.jar master:/usr/spark-2.3.0/jars/worldcount-1.0-SNAPSHOT.jar
提交任务
docker exec master spark-submit \
--master spark://master:7077 \
--class com.ibdsr.WordCount \
--executor-memory 512m \
--total-executor-cores 2 \
/usr/spark-2.3.0/jars/worldcount-1.0-SNAPSHOT.jar \
test.txt
上传文件
验证结果