1. Short version
--master yarn \
--deploy-mode cluster \
--jars mysql-connector-java-8.0.28.jar,sparklens-0.3.2-s_2.11.jar \
--driver-class-path mysql-connector-java-8.0.28.jar \
--conf spark.executor.extraClassPath=sparklens-0.3.2-s_2.11.jar \
--conf spark.extraListeners=com.qubole.sparklens.QuboleJobListener \
--conf spark.sparklens.reporting.disabled=true \
--conf spark.sparklens.data.dir=/projects/sparklens \
--conf spark.default.parallelism=1900 \
--conf spark.sql.shuffle.partitions=1900 \
--conf spark.dynamicAllocation.maxExecutors=250 \
--conf spark.dynamicAllocation.executorIdleTimeout=120s \
--conf spark.dynamicAllocation.initialExecutors=200 \
--conf spark.dynamicAllocation.minExecutors=200 \
--conf spark.executor.cores=3 \
--conf spark.executor.memory=512m \
--conf spark.executor.memoryOverhead=1g \
--conf spark.blacklist.enabled=true \
--conf spark.yarn.nodemanager.vmem-check-enabled=false \
--conf spark.yarn.nodemanager.pmem-check-enabled=false \
--conf spark.speculation.quantile=0.5 \
--conf spark.speculation.multiplier=1.4 \
--conf spark.yarn.dist.files=hdfs://projects/log4j2.properties#log4j2.properties \
--conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:log4j2.properties" \
--conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=file:log4j2.properties" \
2. Detailed annotated version (Spark 2.4)
--master yarn \ # run on YARN; the single "yarn-cluster" master value has been deprecated since Spark 2.0
--deploy-mode cluster \ # run the driver inside the YARN cluster rather than on the submitting machine
--jars mysql-connector-java-8.0.28.jar,sparklens-0.3.2-s_2.11.jar \ # ship the extra JARs mysql-connector-java-8.0.28.jar and sparklens-0.3.2-s_2.11.jar to the driver and executors
--driver-class-path mysql-connector-java-8.0.28.jar \ # prepend mysql-connector-java-8.0.28.jar to the driver classpath
--conf spark.executor.extraClassPath=sparklens-0.3.2-s_2.11.jar \ # prepend sparklens-0.3.2-s_2.11.jar to the executor classpath (the relative name works because --jars localizes the JAR into each container's working directory)
--conf spark.extraListeners=com.qubole.sparklens.QuboleJobListener \ # register the Sparklens listener, which collects performance metrics for the job
--conf spark.sparklens.reporting.disabled=true \ # skip the inline Sparklens report at the end of the run; the collected data can be analyzed offline later
--conf spark.sparklens.data.dir=/projects/sparklens \ # directory where Sparklens writes its data files
--conf spark.default.parallelism=1900 \ # default number of partitions for RDD operations; if unset, YARN mode defaults to the total number of executor cores (minimum 2)
--conf spark.sql.shuffle.partitions=1900 \ # number of partitions used by SQL/DataFrame shuffles; the default is 200
--conf spark.dynamicAllocation.maxExecutors=250 \ # upper bound on executors under dynamic allocation
--conf spark.dynamicAllocation.executorIdleTimeout=120s \ # executors idle for longer than this are released
--conf spark.dynamicAllocation.initialExecutors=200 \ # number of executors requested at startup
--conf spark.dynamicAllocation.minExecutors=200 \ # lower bound on executors under dynamic allocation
--conf spark.executor.cores=3 \ # CPU cores per executor
--conf spark.executor.memory=512m \ # heap memory per executor
--conf spark.executor.memoryOverhead=1g \ # extra off-heap memory per executor container (JVM overhead, native buffers, etc.)
--conf spark.blacklist.enabled=true \ # blacklist executors/nodes with repeated task failures so the job stops retrying on bad hosts
--conf spark.yarn.nodemanager.vmem-check-enabled=false \ # intended to disable YARN's virtual-memory check; note this is really the NodeManager setting yarn.nodemanager.vmem-check-enabled in yarn-site.xml, so passing it as a Spark conf has no effect
--conf spark.yarn.nodemanager.pmem-check-enabled=false \ # same caveat for the physical-memory check (yarn.nodemanager.pmem-check-enabled)
--conf spark.speculation.quantile=0.5 \ # fraction of tasks in a stage that must finish before speculation is considered (only takes effect together with spark.speculation=true, which is not set here)
--conf spark.speculation.multiplier=1.4 \ # a task becomes a candidate for speculation when it runs this many times slower than the median task
--conf spark.yarn.dist.files=hdfs://projects/log4j2.properties#log4j2.properties \ # distribute the log configuration file into the working directory of the driver and executors; the part after # is the local alias
--conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:log4j2.properties" \ # point the driver's log4j at the distributed configuration file
--conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=file:log4j2.properties" \ # point each executor's log4j at the distributed configuration file
3. Detailed annotated version (Spark 3.x)
--master yarn \ # run on YARN in cluster mode (replaces the deprecated "yarn-cluster" master value)
--deploy-mode cluster \
--jars mysql-connector-java-8.0.28.jar \ # ship the MySQL JDBC driver with the application
--driver-class-path mysql-connector-java-8.0.28.jar \ # prepend the JDBC driver to the driver classpath
--conf spark.default.parallelism=5000 \ # default number of partitions for RDD operations
--conf spark.sql.shuffle.partitions=5000 \ # number of partitions for SQL/DataFrame shuffles; with AQE (spark.sql.adaptive.enabled, on by default since Spark 3.2) these can be coalesced at runtime
--conf spark.dynamicAllocation.maxExecutors=1000 \ # upper bound on executors under dynamic allocation
--conf spark.dynamicAllocation.executorIdleTimeout=120s \ # executors idle for longer than this are released
--conf spark.dynamicAllocation.initialExecutors=100 \ # number of executors requested at startup
--conf spark.dynamicAllocation.minExecutors=100 \ # lower bound on executors under dynamic allocation
--conf spark.executor.cores=1 \ # CPU cores per executor
--conf spark.executor.memory=10g \ # heap memory per executor
--conf spark.executor.memoryOverhead=1g \ # extra off-heap memory per executor container
--conf spark.shuffle.memoryFraction=0.2 \ # legacy (pre-1.6) setting; the legacy memory manager was removed in Spark 3.0, so this conf is ignored there (tune spark.memory.fraction instead)
--conf spark.excludeOnFailure.enabled=true \ # exclude executors/nodes with repeated task failures (renamed from spark.blacklist.enabled in Spark 3.1)
--conf spark.yarn.nodemanager.vmem-check-enabled=false \ # see the Spark 2.4 section: the vmem/pmem checks are YARN NodeManager settings (yarn-site.xml) and have no effect as Spark confs
--conf spark.yarn.nodemanager.pmem-check-enabled=false \
--conf spark.speculation.quantile=0.5 \ # fraction of tasks that must finish before speculation is considered (requires spark.speculation=true)
--conf spark.speculation.multiplier=1.4 \ # a task is speculated when it runs this many times slower than the median
--conf spark.rss.enabled=true \ # toggle for a remote shuffle service (RSS) client; this is not a vanilla Apache Spark conf and only works with the matching RSS client JARs deployed
--conf spark.shuffle.io.clientThreads=16 \ # number of Netty client threads used for shuffle data transfers
--conf spark.yarn.dist.files=hdfs://R2/a.conf#aa.conf,hdfs://R2/a/log4j.properties#log4j-first.properties \ # distribute extra files; the part after # is the alias under which each file appears in the container working directory
--conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:log4j-first.properties" \ # driver log configuration (on Spark 3.3+, which uses log4j2, this becomes -Dlog4j2.configurationFile=...)
--conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=file:log4j-first.properties" \ # executor log configuration