1. application.conf
# sparkConf settings
spark.worker.timeout="500" # 1. Spark worker timeout
spark.rpc.askTimeout="600s" # 2. RPC ask timeout
spark.network.timeout="600s" # 3. Network timeout
spark.cores.max="10" # 4. Maximum number of cores the application may use
spark.task.maxFailures="5" # 5. Maximum number of retries for a failed task
spark.speculation="true" # 6. Enable speculative execution
spark.speculation.interval="300s" # Check for speculatable tasks every 300 seconds
# Speculation quantile: only once 90% of the tasks in a stage have finished
# does speculation kick in and relaunch the remaining slow tasks on other nodes
spark.speculation.quantile="0.9"
spark.driver.allowMultipleContexts="true" # 7. Whether multiple SparkContexts may run in one driver
spark.serializer="org.apache.spark.serializer.KryoSerializer" # 8. Serializer
spark.buffer.pageSize="200m" # 9. Buffer page size
spark.sql.adaptive.enabled="true" # 10. Enable SQL adaptive execution
spark.streaming.checkpointdir="hdfs://node01:9000/streaming" # 11. Checkpoint directory
spark.streaming.kafka.maxRatePerPartition="1000" # 12. Rate limit (see the worked example after this block)
spark.streaming.backpressure.enabled="true" # 13. Enable backpressure
spark.streaming.backpressure.initialRate="10" # 14. Initial backpressure rate
spark.streaming.concurrentJobs=5 # 15. Number of concurrent streaming jobs
# sparkSession settings
spark.scheduler.mode="FAIR" # 1. Job scheduling mode: FAIR (fair scheduler) or FIFO (first in, first out)
spark.executor.memoryOverhead="512" # 2. Off-heap overhead memory
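As a quick sanity check on the rate-limit settings above, here is a back-of-the-envelope calculation; the batch interval and Kafka partition count are assumed values for illustration:

// Maximum records per micro-batch under spark.streaming.kafka.maxRatePerPartition:
val maxRatePerPartition = 1000 // records/sec per partition, from application.conf
val batchIntervalSec = 5       // assumed batch interval in seconds
val kafkaPartitions = 3        // assumed number of Kafka partitions
val maxRecordsPerBatch = maxRatePerPartition * batchIntervalSec * kafkaPartitions // 15000
// With backpressure enabled, Spark starts from backpressure.initialRate and adjusts
// the actual ingestion rate dynamically, but never above this per-batch cap.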
2. Add the Maven dependencies
<dependency>
    <!-- Typesafe Config: the dependency Scala uses to read the application.conf file from resources -->
    <groupId>com.typesafe</groupId>
    <artifactId>config</artifactId>
    <version>1.2.1</version>
</dependency>
<dependency>
    <!-- Spark SQL dependency -->
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.0</version>
    <!-- or <version>${spark.version}</version>, with the property defined in <properties> (see below) -->
</dependency>
<dependency>
    <!-- Spark Streaming dependency -->
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
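If you prefer the commented-out ${spark.version} form above, define the version once in a <properties> block; this is a sketch, assuming your POM does not already declare the property:

<properties>
    <!-- single place to bump the Spark version for both artifacts -->
    <spark.version>2.1.0</spark.version>
</properties>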
3. Utility class for reading the configuration file
package com.cartravel.common.readApplicationconfUtil

import com.typesafe.config.{Config, ConfigFactory}

class readApplicatuinFileUtil {
  // ConfigFactory.load() picks up application.conf from the classpath (resources)
  private val config: Config = ConfigFactory.load()

  // Look up a configuration value by key
  def getConf(key: String): String = config.getString(key)
}

// Singleton so callers can write readApplicatuinFileUtil.getConf(...)
object readApplicatuinFileUtil extends readApplicatuinFileUtil
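A quick usage check for the utility (a sketch, assuming the application.conf above is on the classpath):

// Look up a value from application.conf by its key
val workerTimeout = readApplicatuinFileUtil.getConf("spark.worker.timeout")
println(workerTimeout) // prints "500"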
4. Tie application.conf and the reader utility together: a utility object that creates the SparkConf, SparkSession, and StreamingContext
package com.cartravel.sparkInitialization

import com.cartravel.bean.{DriverInfo, Opt_alliance_business, OrderInfo, RegisterUsers}
import com.cartravel.common.readApplicationconfUtil.readApplicatuinFileUtil
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object sparkInitialization {
  //================================== sparkConf ===========================
  def getSparkConf: SparkConf = {
    val sparkConf: SparkConf = new SparkConf()
      .set("spark.worker.timeout", readApplicatuinFileUtil.getConf("spark.worker.timeout"))
      .set("spark.rpc.askTimeout", readApplicatuinFileUtil.getConf("spark.rpc.askTimeout"))
      .set("spark.network.timeout", readApplicatuinFileUtil.getConf("spark.network.timeout"))
      .set("spark.cores.max", readApplicatuinFileUtil.getConf("spark.cores.max"))
      .set("spark.task.maxFailures", readApplicatuinFileUtil.getConf("spark.task.maxFailures"))
      .set("spark.speculation", readApplicatuinFileUtil.getConf("spark.speculation"))
      .set("spark.speculation.interval", readApplicatuinFileUtil.getConf("spark.speculation.interval"))
      .set("spark.speculation.quantile", readApplicatuinFileUtil.getConf("spark.speculation.quantile"))
      .set("spark.driver.allowMultipleContexts", readApplicatuinFileUtil.getConf("spark.driver.allowMultipleContexts"))
      .set("spark.serializer", readApplicatuinFileUtil.getConf("spark.serializer"))
      .set("spark.buffer.pageSize", readApplicatuinFileUtil.getConf("spark.buffer.pageSize"))
      .set("spark.sql.adaptive.enabled", readApplicatuinFileUtil.getConf("spark.sql.adaptive.enabled"))
      .set("spark.streaming.checkpointdir", readApplicatuinFileUtil.getConf("spark.streaming.checkpointdir"))
      .set("spark.streaming.kafka.maxRatePerPartition", readApplicatuinFileUtil.getConf("spark.streaming.kafka.maxRatePerPartition"))
      .set("spark.streaming.backpressure.enabled", readApplicatuinFileUtil.getConf("spark.streaming.backpressure.enabled"))
      .set("spark.streaming.backpressure.initialRate", readApplicatuinFileUtil.getConf("spark.streaming.backpressure.initialRate"))
      .registerKryoClasses( // Kryo serialization, far faster than Java serialization
        Array( // register the order, alliance-business, driver, and registered-user beans
          classOf[OrderInfo],
          classOf[Opt_alliance_business],
          classOf[DriverInfo],
          classOf[RegisterUsers]
        )
      )
      .setAppName("query")
      .setMaster("local[*]")
    sparkConf
  }

  //================================== sparkSession ===========================
  def getSparkSession: SparkSession = {
    val sparkSession = SparkSession.builder()
      .config(getSparkConf)
      .config("spark.scheduler.mode", readApplicatuinFileUtil.getConf("spark.scheduler.mode"))
      .config("spark.executor.memoryOverhead", readApplicatuinFileUtil.getConf("spark.executor.memoryOverhead"))
      .config("enableSendEmailOnTaskFail", "true") // whether to send an alert email when a task fails
      .config("spark.extraListeners", "com.cartravel.spark.SparkAppListener") // offline monitoring: the class path of your custom listener
      .enableHiveSupport() // enable Hive support
      .getOrCreate()
    sparkSession.sparkContext.setLocalProperty("spark.scheduler.pool", "n1") // submit jobs to the n1 pool
    sparkSession
  }

  //================================== streamingContext ===========================
  def getStreamingContext(sc: SparkContext, duration: Int): StreamingContext = {
    val streamingContext: StreamingContext = new StreamingContext(sc, Seconds(duration))
    streamingContext
  }
}
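A minimal usage sketch of the object above; the 5-second batch interval is an assumed value:

val spark = sparkInitialization.getSparkSession // all SparkConf settings are applied inside
val ssc = sparkInitialization.getStreamingContext(spark.sparkContext, 5)
// ... build your DStreams here ...
ssc.start()
ssc.awaitTermination()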
Finally, note these two lines:
sparkSession.config("spark.scheduler.mode", readApplicatuinFileUtil.getConf("spark.scheduler.mode")) // FAIR
sparkSession.sparkContext.setLocalProperty("spark.scheduler.pool", "n1") // submit jobs to the n1 pool
When we choose the FAIR scheduler, we need to place a fairscheduler.xml file in the module's resources directory:
- fairscheduler.xml defines the scheduling pools. Because we set the pool to n1, at runtime the program looks up the n1 pool in fairscheduler.xml, sees that it uses FAIR scheduling, and allocates resources according to the weight and minShare configured for that pool.
<allocations>
    <pool name="n1">
        <schedulingMode>FAIR</schedulingMode>
        <weight>10</weight> <!-- scheduling weight -->
        <minShare>10</minShare> <!-- minimum share of cores for this pool -->
    </pool>
    <pool name="n2">
        <schedulingMode>FAIR</schedulingMode>
        <weight>4</weight>
        <minShare>3</minShare>
    </pool>
    <pool name="n3">
        <schedulingMode>FAIR</schedulingMode>
        <weight>1</weight>
        <minShare>3</minShare>
    </pool>
    <pool name="default">
        <schedulingMode>FAIR</schedulingMode>
        <weight>1</weight>
        <minShare>3</minShare>
    </pool>
</allocations>
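With FAIR scheduling, the pool is chosen per thread via setLocalProperty, so jobs submitted from different threads can land in different pools. A minimal sketch (the jobs themselves are placeholders):

val spark = sparkInitialization.getSparkSession
val sc = spark.sparkContext

// Jobs submitted from this thread go to the high-weight n1 pool
new Thread(new Runnable {
  override def run(): Unit = {
    sc.setLocalProperty("spark.scheduler.pool", "n1")
    sc.parallelize(1 to 1000000).count() // placeholder job
  }
}).start()

// Jobs submitted from this thread go to the lower-weight n2 pool
new Thread(new Runnable {
  override def run(): Unit = {
    sc.setLocalProperty("spark.scheduler.pool", "n2")
    sc.parallelize(1 to 1000000).count() // placeholder job
  }
}).start()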