Installing Spark with docker-compose

This article uses docker-compose to bring up a Spark standalone cluster (1 master + 2 workers) and stores the Spark history (event) logs on Hadoop HDFS.

Note: everything here is deployed on a single machine. To run more workers, docker-compose up --scale spark-worker=3 can scale the worker service out; this assumes a single generic worker service without fixed IPs or published host ports, unlike the per-worker services below. A sketch of the command follows.
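A minimal sketch of the scaling command, assuming a hypothetical single spark-worker service (no ipv4_address, no per-worker host ports):

# Hypothetical: scale a generic spark-worker service to three instances
docker-compose up -d --scale spark-worker=3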

1. Prepare the Docker environment

I am on macOS; the Docker version is Docker version 24.0.6 and the docker-compose version is v2.23.0-desktop.1.

Disable Compose V2 support: docker-compose disable-v2

The Compose file format itself has three syntax versions: 1, 2.x, and 3.x. Separately from that, if the Compose V2 CLI is not disabled, running docker-compose with this setup fails with: unexpected character "-" in variable name near ..., apparently because the keys in hadoopconfig.env (such as CORE-SITE.XML_fs.defaultFS) contain characters the V2 env-file parser rejects. Running docker-compose disable-v2 resolves the problem.
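For reference, the version checks and the switch back to Compose V1 (the disable-v2/enable-v2 subcommands come from Docker Desktop's compose switcher, so they may not exist in every installation):

docker --version           # Docker version 24.0.6 in my case
docker-compose version     # shows whether the V1 or V2 CLI is active
docker-compose disable-v2  # switch the docker-compose command back to V1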

2. Install Spark 3.3.4
2.1 Create docker-compose.yaml
version: '3'
services:
  spark-master:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-master
    user: "root:root"
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '8080:8080'
      - '4040:4040'
      - '7077:7077'
    networks:
      spark_network:
        ipv4_address: 172.22.0.100
    extra_hosts:
      - "spark-master:172.22.0.100"
      - "spark-worker1:172.22.0.101"
      - "spark-worker2:172.22.0.102"
  spark-worker-1:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-worker1
    user: "root:root"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=2G
      - SPARK_WORKER_CORES=2
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '8081:8081'
    networks:
      spark_network:
        ipv4_address: 172.22.0.101
    extra_hosts:
        - "spark-master:172.22.0.100"
        - "spark-worker1:172.22.0.101"
        - "spark-worker2:172.22.0.102"
  spark-worker-2:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-worker2
    user: "root:root"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=2
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '8082:8081'
    networks:
      spark_network:
        ipv4_address: 172.22.0.102
    extra_hosts:
        - "spark-master:172.22.0.100"
        - "spark-worker1:172.22.0.101"
        - "spark-worker2:172.22.0.102"
  spark-history:
    image: docker.io/bitnami/spark:3.3.4
    hostname: spark-history
    restart: on-failure
    depends_on:
      - namenode
    user: "root:root"
    command: ["/opt/bitnami/spark/sbin/start-history-server.sh"]
    environment:
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_USER=spark
    volumes:
      - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf
    ports:
      - '18080:18080'
    networks:
      - spark_network
  namenode:
    image: apache/hadoop:3.3.5
    hostname: namenode
    command: ["hdfs", "namenode"]
    user: "root:root"
    ports:
      - 9870:9870
      - 8020:8020
    volumes:
      - spark_hadoop_nn_volume:/tmp/hadoop-root/dfs
    env_file:
      - ./hadoopconfig.env
    privileged: true
    environment:
      ENSURE_NAMENODE_DIR: "/tmp/hadoop-root/dfs/name"
    networks:
      - spark_network
  datanode:
    image: apache/hadoop:3.3.5
    hostname: datanode
    command: ["hdfs", "datanode"]
    user: "root:root"
    env_file:
      - ./hadoopconfig.env
    privileged: true
    ports:
      - 9864:9864
      - 9866:9866
    volumes:
      - spark_hadoop_dn_volume:/tmp/hadoop-root/dfs
    networks:
      - spark_network
networks:
  spark_network:
    driver: bridge
    ipam:
      config:
        - subnet: "172.22.0.0/24"
volumes:
  spark_hadoop_dn_volume:
  spark_hadoop_nn_volume:
2.2 Create the environment file hadoopconfig.env
CORE-SITE.XML_fs.default.name=hdfs://namenode
CORE-SITE.XML_fs.defaultFS=hdfs://namenode
CORE-SITE.XML_hadoop.http.staticuser.user=root
CORE-SITE.XML_hadoop.tmp.dir=/tmp/hadoop-root
HDFS-SITE.XML_dfs.namenode.rpc-address=namenode:8020
HDFS-SITE.XML_dfs.replication=1
MAPRED-SITE.XML_mapreduce.framework.name=yarn
MAPRED-SITE.XML_yarn.app.mapreduce.am.env=HADOOP_MAPRED_HOME=${HADOOP_HOME}
MAPRED-SITE.XML_mapreduce.map.env=HADOOP_MAPRED_HOME=${HADOOP_HOME}
MAPRED-SITE.XML_mapreduce.reduce.env=HADOOP_MAPRED_HOME=${HADOOP_HOME}
MAPRED-SITE.XML_mapreduce.jobhistory.address=0.0.0.0:10020
MAPRED-SITE.XML_mapreduce.jobhistory.webapp.address=0.0.0.0:19888
YARN-SITE.XML_yarn.resourcemanager.hostname=resourcemanager
YARN-SITE.XML_yarn.nodemanager.pmem-check-enabled=true
YARN-SITE.XML_yarn.nodemanager.delete.debug-delay-sec=600
YARN-SITE.XML_yarn.nodemanager.vmem-check-enabled=true
YARN-SITE.XML_yarn.nodemanager.aux-services=mapreduce_shuffle
YARN-SITE.XML_yarn.nodemanager.resource.cpu-vcores=4
YARN-SITE.XML_yarn.application.classpath=/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-applications=10000
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-am-resource-percent=0.1
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.resource-calculator=org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.queues=default
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.capacity=100
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.user-limit-factor=1
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.maximum-capacity=100
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.state=RUNNING
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_submit_applications=*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_administer_queue=*
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.node-locality-delay=40
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings=
CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings-override.enable=false
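The apache/hadoop image's entrypoint turns each FILE.XML_property=value entry in this env file into a property of the corresponding Hadoop config file. Once the containers are running (section 2.5), this can be verified roughly as follows (the path assumes the image's HADOOP_HOME of /opt/hadoop):

# Show the core-site.xml generated from the CORE-SITE.XML_* entries above
docker-compose exec namenode cat /opt/hadoop/etc/hadoop/core-site.xml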
2.3 Create spark-defaults.conf
spark.eventLog.enabled                 true
spark.eventLog.dir                     hdfs://namenode:8020/shared/spark-logs
# Event logs are stored on HDFS, so the history provider must be FsHistoryProvider
spark.history.provider                 org.apache.spark.deploy.history.FsHistoryProvider
# Directory the history server reads logs from, here on HDFS
spark.history.fs.logDirectory          hdfs://namenode:8020/shared/spark-logs
spark.history.ui.port                  18080
# Number of applications whose UI data is kept in memory; beyond this, older
# application data is evicted (this limits in-memory UIs, not the listing on the page)
spark.history.retainedApplications     30
# Whether to periodically clean up event logs
spark.history.fs.cleaner.enabled       true
# How often the cleaner checks for logs to delete
spark.history.fs.cleaner.interval      1d
# Event logs older than this are deleted
spark.history.fs.cleaner.maxAge        7d
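Since the same spark-defaults.conf is bind-mounted into every Spark container, a quick sanity check after startup is to read it back from inside the master:

# Confirm the mounted configuration is what the containers actually see
docker-compose exec spark-master cat /opt/bitnami/spark/conf/spark-defaults.conf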
2.4 The resulting directory layout
-rw-r--r--@ 1 taoruicheng  453037844   3.6K 12 26 13:22 docker-compose.yml
-rw-r--r--@ 1 taoruicheng  453037844   2.4K 12 25 14:59 hadoopconfig.env
-rw-r--r--@ 1 taoruicheng  453037844   905B 12 26 13:36 spark-defaults.conf
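With all three files in place, the compose file can be validated before anything is started; a minimal check:

# Parse docker-compose.yml together with the referenced env file and print
# the resolved configuration, or an error if something is malformed
docker-compose config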
2.5 Start the services

docker-compose up -d

2.6 Check the container status

docker-compose ps

NAME                     IMAGE                           COMMAND                   SERVICE          CREATED          STATUS          PORTS
spark-datanode-1         apache/hadoop:3.3.5             "/usr/local/bin/dumb…"   datanode         11 minutes ago   Up 11 minutes   0.0.0.0:9864->9864/tcp, 0.0.0.0:9866->9866/tcp
spark-namenode-1         apache/hadoop:3.3.5             "/usr/local/bin/dumb…"   namenode         11 minutes ago   Up 11 minutes   0.0.0.0:8020->8020/tcp, 0.0.0.0:9870->9870/tcp
spark-spark-history-1    docker.io/bitnami/spark:3.3.4   "/opt/bitnami/script…"   spark-history    11 minutes ago   Up 19 seconds   0.0.0.0:18080->18080/tcp
spark-spark-master-1     docker.io/bitnami/spark:3.3.4   "/opt/bitnami/script…"   spark-master     11 minutes ago   Up 11 minutes   0.0.0.0:4040->4040/tcp, 0.0.0.0:7077->7077/tcp, 0.0.0.0:8080->8080/tcp
spark-spark-worker-1-1   docker.io/bitnami/spark:3.3.4   "/opt/bitnami/script…"   spark-worker-1   11 minutes ago   Up 11 minutes   0.0.0.0:8081->8081/tcp
spark-spark-worker-2-1   docker.io/bitnami/spark:3.3.4   "/opt/bitnami/script…"   spark-worker-2   11 minutes ago   Up 11 minutes   0.0.0.0:8082->8081/tcp
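The history server depends on the namenode and is set to restart on failure, which is why it shows a much shorter uptime above ("Up 19 seconds"). If it keeps restarting, its log usually shows the reason, for example HDFS not being reachable yet:

# Follow the history server output to diagnose startup/HDFS issues
docker-compose logs -f spark-history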
3. Create the directory on HDFS

Open http://localhost:9870/explorer.html#/ and create the /shared/spark-logs directory.
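Alternatively, the directory can be created from the namenode container instead of the web UI; a sketch (the permissive chmod is just to let the spark user write event logs without further user mapping):

# Create the Spark event-log directory on HDFS and make it writable
docker-compose exec namenode hdfs dfs -mkdir -p /shared/spark-logs
docker-compose exec namenode hdfs dfs -chmod -R 777 /shared/spark-logs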

For more details, see:
https://blog.csdn.net/taoruicheng1/article/details/135114606

Note: to check which Hadoop/HDFS client version the Spark image uses, look at the Hadoop jars under /opt/bitnami/spark/jars.
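For example, a rough way to check the bundled Hadoop client version from the host:

# Spark 3.3.4 ships hadoop-client-api/runtime jars; their version should be
# compatible with the HDFS cluster (apache/hadoop:3.3.5 here)
docker-compose exec spark-master ls /opt/bitnami/spark/jars | grep hadoop-client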

4. Run a Spark example
1) Change into /opt/bitnami/spark/bin
2) Run (or submit from the host with docker-compose exec, as sketched below): spark-submit --class org.apache.spark.examples.SparkPi --master spark://spark-master:7077 /opt/bitnami/spark/examples/jars/spark-examples_2.12-3.3.4.jar 3
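A sketch of submitting the same job from the host via docker-compose exec instead of opening a shell inside the container:

# Submit the SparkPi example to the standalone master
docker-compose exec spark-master /opt/bitnami/spark/bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master spark://spark-master:7077 \
  /opt/bitnami/spark/examples/jars/spark-examples_2.12-3.3.4.jar 3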
5. Check the web UIs
1) Spark master UI: http://localhost:8080/
2) History server UI: http://localhost:18080/ (a quick API check follows below)
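After a run finishes, the application should show up in the history server once its event log has been written to HDFS; a quick check from the host:

# The history server's REST API lists completed (and running) applications
curl -s http://localhost:18080/api/v1/applications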

Environment variable reference: https://spark.apache.org/docs/latest/configuration.html#environment-variables

docker-compose reference: https://github.com/bitnami/containers/blob/main/bitnami/spark/README.md
