Import the dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
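If the project builds with sbt instead of Maven, the equivalent dependency would look like this (the `%%` operator resolves the `_2.11` Scala suffix from your `scalaVersion`; this line is an assumed equivalent, not from the original):

libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.1.1"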
Receiver-based connection
- In Kafka 0.8, offsets are maintained in ZooKeeper.
- If Kafka receives new data while the application is stopped, the application can still consume the data produced during the downtime once it restarts, because the consumer group's offsets persist in ZooKeeper.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

def main(args: Array[String]): Unit = {
  val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
  val ssc = new StreamingContext(conf, Seconds(3))
  // Receiver-based stream: pass the ZooKeeper quorum (where offsets live),
  // the consumer group id, and a map of topic -> number of receiver threads.
  val dataDStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(
    ssc,
    "hadoop01:2181,hadoop02:2181,hadoop03:2181",
    "bigdata0330",
    Map[String, Int]("SparkStreaming" -> 2)
  )
  // Each record is a (key, message) pair; take the message and word-count it per batch.
  val resDStream: DStream[(String, Int)] = dataDStream.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  resDStream.print()
  ssc.start()
  ssc.awaitTermination()
}
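To try the example, you can feed the SparkStreaming topic with Kafka's console producer (broker host names assumed to match the Direct examples below); each line typed becomes one message, and the job prints updated word counts every 3 seconds:

bin/kafka-console-producer.sh --broker-list hadoop01:9092,hadoop02:9092,hadoop03:9092 --topic SparkStreaming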
Direct connection
Automatic offset maintenance
- Offsets are saved in the checkpoint directory.
- The way the StreamingContext is obtained has to change (use StreamingContext.getActiveOrCreate so a checkpointed context can be recovered).
- Messages can be lost with this approach.
- It produces too many small files in the checkpoint directory.
- When the Spark application is run again, it starts from the last recorded timestamp and re-executes every batch interval between then and the current time.
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

def main(args: Array[String]): Unit = {
  // Recover the context (and the offsets checkpointed with it) if a checkpoint exists;
  // otherwise build a fresh one with createContext().
  val ssc: StreamingContext = StreamingContext.getActiveOrCreate(
    "D:\\develop\\workspace\\bigdata2021\\spark2021\\checkpoint",
    () => createContext()
  )
  ssc.start()
  ssc.awaitTermination()
}

def createContext(): StreamingContext = {
  val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
  val ssc = new StreamingContext(conf, Seconds(3))
  // The checkpoint directory is where the offsets (and DStream lineage) are saved.
  ssc.checkpoint("D:\\develop\\workspace\\bigdata2021\\spark2021\\checkpoint")
  val kafkaParams = Map[String, String](
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
    ConsumerConfig.GROUP_ID_CONFIG -> "bigdata0330"
  )
  // Direct stream: reads the brokers directly with no receiver; offsets ride along in the checkpoint.
  val dataDStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
    ssc,
    kafkaParams,
    Set("SparkStreaming")
  )
  val resDStream: DStream[(String, Int)] = dataDStream.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  resDStream.print()
  ssc
}
Manual offset maintenance
- In real projects, to keep the data accurate and consistent, the consumed offsets are saved to a transactional store such as MySQL; a sketch of this is given after the example below.
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

def main(args: Array[String]): Unit = {
  val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
  val ssc = new StreamingContext(conf, Seconds(3))
  val kafkaParams = Map[String, String](
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
    ConsumerConfig.GROUP_ID_CONFIG -> "bigdata0330"
  )
  // Starting offset per partition; in a real job these are loaded from the external store.
  val fromOffsets = Map[TopicAndPartition, Long](
    TopicAndPartition("SparkStreaming", 0) -> 10L,
    TopicAndPartition("SparkStreaming", 1) -> 10L
  )
  // The message handler extracts just the message payload from each record.
  val dataDStream: InputDStream[String] = KafkaUtils
    .createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      kafkaParams,
      fromOffsets,
      (m: MessageAndMetadata[String, String]) => m.message()
    )
  var ranges: Array[OffsetRange] = Array.empty[OffsetRange]
  dataDStream.transform { rdd =>
    // Capture the offset ranges on the driver before any shuffle breaks the 1:1
    // mapping between Kafka partitions and RDD partitions.
    ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    rdd
  }.foreachRDD { rdd =>
    // After the batch is processed, commit the captured ranges to the external store;
    // here they are just printed.
    for (o <- ranges) {
      println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
    }
  }
  ssc.start()
  ssc.awaitTermination()
}
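As the note above says, a production job would persist these offsets transactionally instead of printing them. Below is a minimal JDBC sketch under assumed names: the MySQL table kafka_offsets (topic, partition, offset), the connection URL, and the credentials are all illustrative, not part of the original example.

import java.sql.DriverManager

import kafka.common.TopicAndPartition
import org.apache.spark.streaming.kafka.OffsetRange

object OffsetStore {
  // Illustrative connection settings; adjust to your environment.
  val url = "jdbc:mysql://hadoop01:3306/bigdata"
  val user = "root"
  val password = "123456"

  // Load the starting offsets (the fromOffsets map in the example above) from MySQL.
  def loadOffsets(): Map[TopicAndPartition, Long] = {
    val conn = DriverManager.getConnection(url, user, password)
    try {
      val rs = conn.prepareStatement(
        "SELECT topic, `partition`, `offset` FROM kafka_offsets").executeQuery()
      var offsets = Map.empty[TopicAndPartition, Long]
      while (rs.next()) {
        offsets += TopicAndPartition(rs.getString(1), rs.getInt(2)) -> rs.getLong(3)
      }
      offsets
    } finally conn.close()
  }

  // Save the ranges captured in foreachRDD inside one transaction, so the result
  // write and the offset write either both happen or both roll back.
  def saveOffsets(ranges: Array[OffsetRange]): Unit = {
    val conn = DriverManager.getConnection(url, user, password)
    try {
      conn.setAutoCommit(false)
      val ps = conn.prepareStatement(
        "REPLACE INTO kafka_offsets (topic, `partition`, `offset`) VALUES (?, ?, ?)")
      for (o <- ranges) {
        ps.setString(1, o.topic)
        ps.setInt(2, o.partition)
        ps.setLong(3, o.untilOffset)
        ps.addBatch()
      }
      ps.executeBatch()
      conn.commit() // results written in the same transaction commit together with the offsets
    } finally conn.close()
  }
}

With these helpers, loadOffsets() would replace the hard-coded fromOffsets map, and saveOffsets(ranges) would replace the println loop inside foreachRDD.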