Smart Travel (智慧出行): the production-required configuration for SparkConf, SparkSession, and StreamingContext, packaged into a utility class. The implementation is described in detail below.


1. application.conf

# SparkConf settings
spark.worker.timeout="500"                                      # 1. Spark worker timeout
spark.rpc.askTimeout="600s"                                     # 2. RPC ask timeout
spark.network.timeout="600s"                                    # 3. Network timeout
spark.cores.max="10"                                            # 4. Maximum number of cores the application may use
spark.task.maxFailures="5"                                      # 5. Maximum number of retries for a failed task
spark.speculation="true"                                        # 6. Enable speculative execution
spark.speculation.interval="300s"                               # How often to check for tasks to speculate (every 300s)
# Speculation quantile: only after 90% of the tasks in a stage have finished are the remaining
# slow tasks speculatively re-launched on other nodes
spark.speculation.quantile="0.9"
spark.driver.allowMultipleContexts="true"                       # 7. Allow multiple SparkContexts in one driver
spark.serializer="org.apache.spark.serializer.KryoSerializer"   # 8. Serializer (Kryo)
spark.buffer.pageSize="200m"                                    # 9. Memory page size
spark.sql.adaptive.enabled="true"                               # 10. Adaptive SQL execution
spark.streaming.checkpointdir="hdfs://node01:9000/streaming"    # 11. Checkpoint directory
spark.streaming.kafka.maxRatePerPartition="1000"                # 12. Rate limit: max records per Kafka partition per second
spark.streaming.backpressure.enabled="true"                     # 13. Enable backpressure
spark.streaming.backpressure.initialRate="10"                   # 14. Initial ingestion rate before backpressure adapts
spark.streaming.concurrentJobs=5                                # 15. Number of streaming jobs that may run concurrently


# SparkSession settings
spark.scheduler.mode="FAIR"                                     # 1. Job scheduling mode within the application: FAIR or FIFO
spark.executor.memoryOverhead="512"                             # 2. Off-heap (overhead) memory per executor, in MB
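
Taken together, spark.streaming.kafka.maxRatePerPartition and the batch interval put a hard ceiling on how many records a single micro-batch can pull in; backpressure then tunes the actual rate below that ceiling based on how fast batches complete. A minimal sketch of the arithmetic (the partition count and batch interval here are assumed values, not taken from this project):

object IngestCeiling {
  def main(args: Array[String]): Unit = {
    // Assumed values, for illustration only
    val kafkaPartitions      = 6L     // partitions in the source topic
    val maxRatePerPartition  = 1000L  // spark.streaming.kafka.maxRatePerPartition
    val batchIntervalSeconds = 5L     // StreamingContext batch interval

    // Hard upper bound on records read into one micro-batch
    val maxRecordsPerBatch = kafkaPartitions * maxRatePerPartition * batchIntervalSeconds
    println(s"At most $maxRecordsPerBatch records per batch")  // 6 * 1000 * 5 = 30000
  }
}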

2. Add the Maven dependencies

<dependency>
 <!-- Typesafe Config: the dependency that lets Scala read application.conf from the resources directory -->
  <groupId>com.typesafe</groupId>
  <artifactId>config</artifactId>
  <version>1.2.1</version>
</dependency>
	
<dependency>
	<!-- Spark SQL dependency -->
	<groupId>org.apache.spark</groupId>
	<artifactId>spark-sql_2.11</artifactId>
	<version>2.1.0</version>
	<!--<version>${spark.version}</version>-->
</dependency>
<dependency>
	<!-- Spark Streaming dependency -->
	<groupId>org.apache.spark</groupId>
	<artifactId>spark-streaming_2.11</artifactId>
	<version>2.1.0</version>
</dependency>
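
If the project is built with sbt rather than Maven, the equivalent dependency declarations would look roughly like this; the scalaVersion shown is an assumption chosen to match the _2.11 artifacts above:

// build.sbt (sketch)
scalaVersion := "2.11.8"                                   // assumed; any 2.11.x matching your Spark build

libraryDependencies ++= Seq(
  "com.typesafe"     %  "config"          % "1.2.1",       // reads application.conf from resources
  "org.apache.spark" %% "spark-sql"       % "2.1.0",       // Spark SQL
  "org.apache.spark" %% "spark-streaming" % "2.1.0"        // Spark Streaming
)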

3. Utility class for reading the configuration file

package com.cartravel.readApplicationconfUtil

import com.typesafe.config.{Config, ConfigFactory}

class readApplicatuinFileUtil {
  // ConfigFactory.load() picks up application.conf from the classpath (src/main/resources)
  private val config: Config = ConfigFactory.load()

  // Returns a function that looks up a key in application.conf and returns its value as a String
  def getConf: String => String = (key: String) => config.getString(key)
}

// Companion object so callers can write readApplicatuinFileUtil.getConf(...) without instantiating the class
object readApplicatuinFileUtil extends readApplicatuinFileUtil
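
A quick sanity check for the reader, assuming the application.conf from step 1 sits on the classpath (for example under src/main/resources); the demo object name is arbitrary:

import com.cartravel.readApplicationconfUtil.readApplicatuinFileUtil

object ReadConfDemo {
  def main(args: Array[String]): Unit = {
    // Each call goes through Typesafe Config and returns the raw String value
    println(readApplicatuinFileUtil.getConf("spark.worker.timeout"))                      // 500
    println(readApplicatuinFileUtil.getConf("spark.streaming.kafka.maxRatePerPartition")) // 1000
    println(readApplicatuinFileUtil.getConf("spark.scheduler.mode"))                      // FAIR
  }
}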

4. Combine the application.conf above with the config reader utility into a utility class that creates the SparkConf, the SparkSession, and the StreamingContext

package com.cartravel.sparkInitialization

import com.cartravel.bean.{DriverInfo, Opt_alliance_business, OrderInfo, RegisterUsers}
import com.cartravel.readApplicationconfUtil.readApplicatuinFileUtil
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}


object sparkInitialization {

//==================================  sparkConf  ===========================
  def getSparkConf: SparkConf ={
    val sparkConf: SparkConf = new SparkConf()
      .set("spark.worker.timeout",readApplicatuinFileUtil.getConf("spark.worker.timeout"))
      .set("spark.rpc.askTimeout",readApplicatuinFileUtil.getConf("spark.rpc.askTimeout"))
      .set("spark.network.timeout",readApplicatuinFileUtil.getConf("spark.network.timeout"))
      .set("spark.cores.max",readApplicatuinFileUtil.getConf("spark.cores.max"))
      .set("spark.task.maxFailures",readApplicatuinFileUtil.getConf("spark.task.maxFailures"))
      .set("spark.speculation",readApplicatuinFileUtil.getConf("saprk.speculation"))
      .set("spark.sqeculation.interval",readApplicatuinFileUtil.getConf("spark.speculation.interval"))
      .set("saprk.speculation.quantile",readApplicatuinFileUtil.getConf("spark.speculation.quantile"))
      .set("spark.driver,allowMutilpleContext",readApplicatuinFileUtil.getConf("spark.driver.allowMutilpleContext"))
      .set("spark.serializer",readApplicatuinFileUtil.getConf("spark.serializer"))
      .set("spark.buffer.pageSize",readApplicatuinFileUtil.getConf("spark.buffer.pageSize"))
      .set("spark.sql.adaptive.enabled",readApplicatuinFileUtil.getConf("spark.sql.adaptive.enabled"))
      .set("spark.streaming.checkpointdir",readApplicatuinFileUtil.getConf("spark.streaming,checkpointdir"))
      .set("spark.streaming.kafka.maxRatePerPartition",readApplicatuinFileUtil.getConf("spark.streaming.kafka.maxRatePerPartition"))
      .set("spark.streaming.backpressure.enabled",readApplicatuinFileUtil.getConf("spark.streaming.backpressure.enabled"))
      .set("spark.streaming.backpressure.initialRate",readApplicatuinFileUtil.getConf("spark.streaming.backpressure"))
      .registerKryoClasses( // Kryo serialization is considerably faster than Java serialization
        Array( // register the order, alliance, driver and registered-user bean classes
          classOf[OrderInfo],
          classOf[Opt_alliance_business],
          classOf[DriverInfo],
          classOf[RegisterUsers]
        )
      )
      .setAppName("query")
      .setMaster("local[*]")

    sparkConf
  }


//==================================  sparkSession  ===========================
  def getSparkSession: SparkSession = {
    val sparkSession = SparkSession.builder()
      .config(getSparkConf)
      .config("spark.scheduler.mode",readApplicatuinFileUtil.getConf("spark.scheduler.mode"))
      .config("spark.executor.memoryOverhead",readApplicatuinFileUtil.getConf("spark.executor.memoryOverhead"))
      .config("enableSendEmailOnTaskFail","true") // whether to send an alert e-mail when a task fails
      .config("spark.extraListeners","com.cartravel.spark.SparkAppListener") // offline monitoring: fully-qualified name of your own listener class
      .enableHiveSupport() // enable Hive support
      .getOrCreate()
    sparkSession.sparkContext.setLocalProperty("spark.scheduler.pool","n1") // submit jobs from this thread to the n1 pool

    sparkSession
  }


//==================================  streamingContext  ===========================
  def getStreamingContext(sc:SparkContext,duration:Int): StreamingContext ={
    val streamingContext: StreamingContext = new StreamingContext(sc,Seconds(duration))
    streamingContext

  }
}
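
As a hedged usage sketch, the three factory methods can be wired together in a streaming driver like the one below. The topic name, bootstrap servers, group id and batch interval are placeholders, and the Kafka integration additionally needs the spark-streaming-kafka-0-10_2.11 artifact, which is not among the dependencies listed in step 2:

import com.cartravel.readApplicationconfUtil.readApplicatuinFileUtil
import com.cartravel.sparkInitialization.sparkInitialization
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object StreamingDriver {
  def main(args: Array[String]): Unit = {
    val spark = sparkInitialization.getSparkSession                  // SparkConf + SparkSession from the utility
    val ssc: StreamingContext =
      sparkInitialization.getStreamingContext(spark.sparkContext, 5) // 5-second batches (illustrative)

    // Checkpoint directory taken from application.conf
    ssc.checkpoint(readApplicatuinFileUtil.getConf("spark.streaming.checkpointdir"))

    // Placeholder Kafka parameters -- adjust to your own cluster
    val kafkaParams = Map[String, Object](
      "bootstrap.servers"  -> "node01:9092",
      "key.deserializer"   -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id"           -> "car-travel-demo",
      "auto.offset.reset"  -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Direct stream: the maxRatePerPartition and backpressure settings from application.conf apply here
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Seq("order_info"), kafkaParams))

    stream.map(_.value()).print()

    ssc.start()
    ssc.awaitTermination()
  }
}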

Finally, pay attention to these two lines in getSparkSession:

.config("spark.scheduler.mode",readApplicatuinFileUtil.getConf("spark.scheduler.mode")) // FAIR
sparkSession.sparkContext.setLocalProperty("spark.scheduler.pool","n1") // submit jobs from this thread to the n1 pool

When the FAIR scheduler is selected, a fairscheduler.xml file has to be placed in the resources directory of the current module:

  • fairscheduler.xml defines the scheduling pools. We point the job at pool n1; at runtime Spark looks up n1 in fairscheduler.xml, sees that it is a FAIR pool, and allocates resources according to the weight and minShare configured for it. A sketch of routing jobs from different threads into different pools follows the file below.
<allocations>
    <pool name="n1">
    	<schedulingMode>FAIR</schedulingMode>
    	<weight>10</weight>//权重
    	<minShare>10</minShare>//池资源
    </pool>
    <pool name="n2">
        <schedulingMode>FAIR</schedulingMode>
        <weight>4</weight>
        <minShare>3</minShare>
    </pool>
    <pool name="n3">
        <schedulingMode>FAIR</schedulingMode>
        <weight>1</weight>
        <minShare>3</minShare>
    </pool>
    <pool name="default">
        <schedulingMode>FAIR</schedulingMode>
        <weight>1</weight>
        <minShare>3</minShare>
    </pool>
</allocations>
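
A minimal sketch of how the pools can be exercised, assuming the sparkInitialization utility above and this fairscheduler.xml are on the classpath (Spark can also be pointed at an explicit file via spark.scheduler.allocation.file); the thread bodies and dataset sizes are purely illustrative:

import com.cartravel.sparkInitialization.sparkInitialization

object FairPoolDemo {
  def main(args: Array[String]): Unit = {
    val spark = sparkInitialization.getSparkSession
    val sc = spark.sparkContext

    // spark.scheduler.pool is a thread-local property, so each thread can target its own pool
    val heavy = new Thread(new Runnable {
      override def run(): Unit = {
        sc.setLocalProperty("spark.scheduler.pool", "n1") // high-weight pool
        sc.parallelize(1 to 10000000).map(_ * 2).count()
      }
    })
    val light = new Thread(new Runnable {
      override def run(): Unit = {
        sc.setLocalProperty("spark.scheduler.pool", "n2") // lower-weight pool
        sc.parallelize(1 to 1000000).map(_ + 1).count()
      }
    })

    heavy.start(); light.start()
    heavy.join();  light.join()
    spark.stop()
  }
}

With spark.scheduler.mode=FAIR, the two jobs share executors roughly in proportion to the pool weights (10 vs 4) instead of running strictly first-in-first-out.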
