Installing Spark with docker-compose
This article uses docker-compose to start a Spark cluster (1 master + 2 workers) and stores the Spark history logs on Hadoop (HDFS).
Note: this is a single-machine deployment. To scale out, docker-compose up --scale spark-worker=3 can increase the number of spark-worker instances; note that --scale targets a single service name, so it requires one spark-worker service definition rather than the fixed-IP spark-worker-1/spark-worker-2 services used below.
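A sketch of what that looks like in practice, assuming a single spark-worker service that, unlike the per-worker services below, has no fixed ipv4_address and no published host port (neither can be shared across replicas):

# start the stack with three worker replicas
docker-compose up -d --scale spark-worker=3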
1. Prepare the Docker environment
I am on macOS, with Docker version 24.0.6 and docker-compose version v2.23.0-desktop.1.
Disable Compose V2 support: docker-compose disable-v2
A point of frequent confusion: the Compose file format has versions 1, 2.x and 3.x, but the problem here is the Compose V2 CLI, not the file format. Compose V2 validates variable names in env_file strictly, and hadoopconfig.env below uses keys such as CORE-SITE.XML_fs.defaultFS that contain "-" and ".". With V2 enabled, docker-compose fails with: unexpected character "-" in variable name near. Running docker-compose disable-v2 switches Docker Desktop back to Compose V1, which accepts these keys.
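To see which implementation currently answers docker-compose, and to switch (the disable-v2/enable-v2 subcommands are a Docker Desktop convenience):

# show the active Compose implementation and version
docker-compose version
# Docker Desktop: route docker-compose back to Compose V1
docker-compose disable-v2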
2. Install Spark 3.3.4
2.1 Create docker-compose.yml
version: '3'
services:
  spark-master:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-master
    user: "root:root"
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '8080:8080'
      - '4040:4040'
      - '7077:7077'
    networks:
      spark_network:
        ipv4_address: 172.22.0.100
    extra_hosts:
      - "spark-master:172.22.0.100"
      - "spark-worker1:172.22.0.101"
      - "spark-worker2:172.22.0.102"
  spark-worker-1:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-worker1
    user: "root:root"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=2G
      - SPARK_WORKER_CORES=2
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '8081:8081'
    networks:
      spark_network:
        ipv4_address: 172.22.0.101
    extra_hosts:
      - "spark-master:172.22.0.100"
      - "spark-worker1:172.22.0.101"
      - "spark-worker2:172.22.0.102"
  spark-worker-2:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-worker2
    user: "root:root"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=2
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '8082:8081'
    networks:
      spark_network:
        ipv4_address: 172.22.0.102
    extra_hosts:
      - "spark-master:172.22.0.100"
      - "spark-worker1:172.22.0.101"
      - "spark-worker2:172.22.0.102"
  spark-history:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-history
    restart: on-failure
    depends_on:
      - namenode
    user: "root:root"
    command: ["/opt/bitnami/spark/sbin/start-history-server.sh"]
    environment:
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '18080:18080'
    networks:
      - spark_network
  namenode:
    image: apache/hadoop:3.3.5
    hostname: namenode
    command: ["hdfs", "namenode"]
    user: "root:root"
    ports:
      - 9870:9870
      - 8020:8020
    volumes:
      - spark_hadoop_nn_volume:/tmp/hadoop-root/dfs
    env_file:
      - ./hadoopconfig.env
    privileged: true
    environment:
      ENSURE_NAMENODE_DIR: "/tmp/hadoop-root/dfs/name"
    networks:
      - spark_network
  datanode:
    image: apache/hadoop:3.3.5
    hostname: datanode
    command: ["hdfs", "datanode"]
    user: "root:root"
    env_file:
      - ./hadoopconfig.env
    privileged: true
    ports:
      - 9864:9864
      - 9866:9866
    volumes:
      - spark_hadoop_dn_volume:/tmp/hadoop-root/dfs
    networks:
      - spark_network
networks:
  spark_network:
    driver: bridge
    ipam:
      config:
        - subnet: "172.22.0.0/24"
volumes:
  spark_hadoop_dn_volume:
  spark_hadoop_nn_volume:
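Before moving on, the file can be sanity-checked; docker-compose config parses it and prints the resolved configuration, so indentation or syntax mistakes surface immediately:

# validate docker-compose.yml and print the resolved configuration
docker-compose config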
2.2 Create the environment file hadoopconfig.env
CORE-SITE.XML_fs.default.name=hdfs://namenode
CORE-SITE.XML_fs.defaultFS=hdfs://namenode
CORE-SITE.XML_hadoop.http.staticuser.user=root
CORE-SITE.XML_hadoop.tmp.dir=/tmp/hadoop-root
HDFS-SITE.XML_dfs.namenode.rpc-address=namenode:8020
HDFS-SITE.XML_dfs.replication=1
MAPRED-SITE.XML_mapreduce.framework.name=yarn
MAPRED-SITE.XML_yarn.app.mapreduce.am.env=HADOOP_MAPRED_HOME=${HADOOP_HOME}
MAPRED-SITE.XML_mapreduce.map.env=HADOOP_MAPRED_HOME=${HADOOP_HOME}
MAPRED-SITE.XML_mapreduce.reduce.env=HADOOP_MAPRED_HOME=${HADOOP_HOME}
MAPRED-SITE.XML_mapreduce.jobhistory.address=0.0.0.0:10020
MAPRED-SITE.XML_mapreduce.jobhistory.webapp.address=0.0.0.0:19888
YARN-SITE.XML_yarn.resourcemanager.hostname=resourcemanager
YARN-SITE.XML_yarn.nodemanager.pmem-check-enabled=true
YARN-SITE.XML_yarn.nodemanager.delete.debug-delay-sec=600
YARN-SITE.XML_yarn.nodemanager.vmem-check-enabled=true
YARN-SITE.XML_yarn.nodemanager.aux-services=mapreduce_shuffle
YARN-SITE.XML_yarn.nodemanager.resource.cpu-vcores=4
YARN-SITE.XML_yarn.application.classpath=/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-applications=10000
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-am-resource-percent=0.1
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.resource-calculator=org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.queues=default
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.capacity=100
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.user-limit-factor=1
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.maximum-capacity=100
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.state=RUNNING
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_submit_applications=*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_administer_queue=*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.node-locality-delay=40
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings=
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings-override.enable=false
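The apache/hadoop image converts each CORE-SITE.XML_/HDFS-SITE.XML_/YARN-SITE.XML_-prefixed variable into a property in the corresponding XML config file at container start. To confirm the values actually landed (the /opt/hadoop path matches the classpath entries above):

# inspect the generated core-site.xml inside the running namenode
docker-compose exec namenode cat /opt/hadoop/etc/hadoop/core-site.xml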
2.3 Create spark-defaults.conf
spark.eventLog.enabled true
spark.eventLog.dir hdfs://namenode:8020/shared/spark-logs
# Logs are stored on HDFS, so the history provider must be FsHistoryProvider
spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider
# Directory the history server reads logs from; here it lives on HDFS
spark.history.fs.logDirectory hdfs://namenode:8020/shared/spark-logs
spark.history.ui.port 18080
# Number of applications whose UI data is retained in memory; beyond this the oldest entries are evicted. This caps in-memory applications, not the number listed on the page.
spark.history.retainedApplications 30
# Whether to periodically clean up event logs
spark.history.fs.cleaner.enabled true
# How often the cleaner checks for logs to delete
spark.history.fs.cleaner.interval 1d
# Logs older than this are deleted
spark.history.fs.cleaner.maxAge 7d
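Note that spark.eventLog.* is read by each application's driver, while spark.history.* is read by the history server; mounting the same spark-defaults.conf into every container keeps the two sides consistent. To double-check what a container actually sees:

# print the config mounted into the master (any service works the same way)
docker-compose exec spark-master cat /opt/bitnami/spark/conf/spark-defaults.conf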
2.4 The resulting directory layout
-rw-r--r--@ 1 taoruicheng 453037844 3.6K 12 26 13:22 docker-compose.yml
-rw-r--r--@ 1 taoruicheng 453037844 2.4K 12 25 14:59 hadoopconfig.env
-rw-r--r--@ 1 taoruicheng 453037844 905B 12 26 13:36 spark-defaults.conf
2.5 Start the stack
docker-compose up -d
2.6 Check container status
docker-compose ps
NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS
spark-datanode-1 apache/hadoop:3.3.5 "/usr/local/bin/dumb…" datanode 11 minutes ago Up 11 minutes 0.0.0.0:9864->9864/tcp, 0.0.0.0:9866->9866/tcp
spark-namenode-1 apache/hadoop:3.3.5 "/usr/local/bin/dumb…" namenode 11 minutes ago Up 11 minutes 0.0.0.0:8020->8020/tcp, 0.0.0.0:9870->9870/tcp
spark-spark-history-1 docker.io/bitnami/spark:3.3.4 "/opt/bitnami/script…" spark-history 11 minutes ago Up 19 seconds 0.0.0.0:18080->18080/tcp
spark-spark-master-1 docker.io/bitnami/spark:3.3.4 "/opt/bitnami/script…" spark-master 11 minutes ago Up 11 minutes 0.0.0.0:4040->4040/tcp, 0.0.0.0:7077->7077/tcp, 0.0.0.0:8080->8080/tcp
spark-spark-worker-1-1 docker.io/bitnami/spark:3.3.4 "/opt/bitnami/script…" spark-worker-1 11 minutes ago Up 11 minutes 0.0.0.0:8081->8081/tcp
spark-spark-worker-2-1 docker.io/bitnami/spark:3.3.4 "/opt/bitnami/script…" spark-worker-2 11 minutes ago Up 11 minutes 0.0.0.0:8082->8081/tcp
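One thing worth noticing in the output: spark-history has been up only 19 seconds after 11 minutes. With restart: on-failure it keeps restarting, because FsHistoryProvider fails to start while spark.history.fs.logDirectory does not yet exist on HDFS. It settles down once the directory is created in the next step; until then its logs show the failure:

# follow the history server logs until /shared/spark-logs exists
docker-compose logs -f spark-history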
3. Create the directory on HDFS
Visit http://localhost:9870/explorer.html#/ and create the /shared/spark-logs directory.
A more detailed walkthrough is available here:
https://blog.csdn.net/taoruicheng1/article/details/135114606
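If you prefer the command line to the NameNode web UI, the same directory can be created from inside the namenode container (a sketch; tighten the permissions as needed):

# create the event-log directory and make it writable for Spark
docker-compose exec namenode hdfs dfs -mkdir -p /shared/spark-logs
docker-compose exec namenode hdfs dfs -chmod -R 777 /shared/spark-logs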
Note: to check which Hadoop client version Spark bundles, look at the Hadoop jars under /opt/bitnami/spark/jars.
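For example, with the stack running:

# list the bundled Hadoop client jars and their versions
docker-compose exec spark-master ls /opt/bitnami/spark/jars | grep hadoop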
4. Run a Spark example
1) Enter the spark-master container and go to /opt/bitnami/spark/bin.
2) Run: spark-submit --class org.apache.spark.examples.SparkPi --master spark://spark-master:7077 /opt/bitnami/spark/examples/jars/spark-examples_2.12-3.3.4.jar 3
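The same submission also works in one shot from the host, without opening a shell first (assuming, as in the bitnami image, that the Spark binaries are on the container's PATH):

# submit SparkPi to the standalone master from the host
docker-compose exec spark-master spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master spark://spark-master:7077 \
  /opt/bitnami/spark/examples/jars/spark-examples_2.12-3.3.4.jar 3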
5. View the web UIs
1) Spark master UI: http://localhost:8080/
2) History server UI: http://localhost:18080/
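Once SparkPi has finished, it should show up in the history server. The history server also exposes a REST API, which is handy for a scripted check:

# list applications known to the history server
curl http://localhost:18080/api/v1/applications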
Environment variable reference: https://spark.apache.org/docs/latest/configuration.html#environment-variables
docker-compose reference: https://github.com/bitnami/containers/blob/main/bitnami/spark/README.md