Spark Streaming: reading data from Kafka
1. Program entry point
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Minutes, StreamingContext}

val spark = SparkSession.builder().appName(this.getClass.getName).master("local[*]").getOrCreate()
import spark.implicits._
val sc = spark.sparkContext
val batchTime = 1 // batch interval in minutes (example value)
val ssc = new StreamingContext(sc, Minutes(batchTime))
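Before this compiles, the project needs the Spark SQL, Spark Streaming, and Kafka 0.10 integration artifacts on the classpath. A minimal build.sbt sketch, where the Spark version number is an assumption and should match your cluster:

// build.sbt (sketch; version numbers are assumptions, align them with your cluster)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.4.8",
  "org.apache.spark" %% "spark-streaming" % "2.4.8",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.4.8"
)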
2. Reading from Kafka (consumer configuration)
val consumerGroup = "demo01_test001"
val template_topic = "cn_number"
val callback_topic = "cn_data"
val topics = Array(template_topic, callback_topic)
val brokers = "hadoop101:9092,hadoop102:9092,hadoop103:9092"
import org.apache.kafka.common.serialization.StringDeserializer

val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> brokers,
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "auto.offset.reset" -> "earliest", // "latest" or "earliest"
  "group.id" -> consumerGroup,
  "enable.auto.commit" -> (false: java.lang.Boolean),
  "session.timeout.ms" -> (90000: java.lang.Integer),
  "heartbeat.interval.ms" -> (9000: java.lang.Integer),
  "request.timeout.ms" -> (100000: java.lang.Integer),
  "max.partition.fetch.bytes" -> (10240000: java.lang.Integer),
  "max.poll.records" -> (800: java.lang.Integer)
)
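These timeout settings are related, not independent: the Kafka consumer docs recommend keeping heartbeat.interval.ms at no more than one third of session.timeout.ms, and older consumer clients also required request.timeout.ms to exceed session.timeout.ms. A trivial sanity check over the values above (a sketch; the 1/3 rule is taken from the Kafka docs):

// Sanity-check the timeout relationships for the values configured above
require(9000 <= 90000 / 3, "heartbeat.interval.ms should be <= session.timeout.ms / 3")
require(100000 > 90000, "request.timeout.ms should exceed session.timeout.ms")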
// Subscribe to the Kafka topics and create the direct input stream
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._

val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
  ssc, LocationStrategies.PreferConsistent,
  ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
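ConsumerStrategies.Subscribe also has an overload that accepts explicit starting offsets, which is useful when offsets are tracked in external storage rather than committed back to Kafka. A sketch, where the starting-offsets map is a hypothetical example:

import org.apache.kafka.common.TopicPartition

// Hypothetical starting point: partition 0 of the template topic from offset 0
val fromOffsets = Map(new TopicPartition(template_topic, 0) -> 0L)
val streamFromOffsets = KafkaUtils.createDirectStream(
  ssc, LocationStrategies.PreferConsistent,
  ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, fromOffsets))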
3. Processing the data (maintain offsets manually; commit them after the batch is processed)
stream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ............................................ (process the batch here)
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
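What replaces the elided middle depends on the job. As a purely hypothetical sketch, the body below counts this batch's records per topic; it also passes an OffsetCommitCallback, which commitAsync accepts as an optional second argument, so failed commits show up in the logs:

import java.util
import org.apache.kafka.clients.consumer.{OffsetAndMetadata, OffsetCommitCallback}
import org.apache.kafka.common.TopicPartition

stream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // Hypothetical processing: count the records in this batch per topic
  rdd.map(r => (r.topic(), 1L)).reduceByKey(_ + _).collect()
    .foreach { case (topic, n) => println(s"$topic: $n records") }
  // Commit with a callback so failed commits are visible in the logs
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges, new OffsetCommitCallback {
    override def onComplete(offsets: util.Map[TopicPartition, OffsetAndMetadata], e: Exception): Unit =
      if (e != null) println(s"offset commit failed: ${e.getMessage}")
  })
}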
ssc.start()
ssc.awaitTermination()