For historical reasons Kafka ships two consumer APIs, so there are two ways to integrate it with Spark Streaming. The new Consumer API is available starting with Kafka 0.10.0.
New Consumer API (Kafka 0.10+)
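Before the code: the project needs the Kafka 0.10 integration module on the classpath. A minimal sbt sketch follows; the version numbers are placeholders, so match them to your Spark and Scala versions.

// build.sbt (versions are assumptions)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.4.0" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.4.0"
)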
import org.apache.commons.lang3.time.FastDateFormat
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, ConsumerStrategy, KafkaUtils, LocationStrategies, LocationStrategy}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Reads streaming data from a Kafka topic in real time and performs a word count on each batch
 * (developed in the IDEA IDE).
 * TODO: Uses the Kafka New Consumer API -> similar to the Direct approach: each batch fetches data by offset range.
 */
object StreamingSourceKafka {
def main(args: Array[String]): Unit = {
// TODO: 1. Create the StreamingContext object, passing in the batch interval
val ssc: StreamingContext = {
// a. Create a SparkConf object and set application properties such as the app name and master
val sparkConf: SparkConf = new SparkConf()
.setAppName(this.getClass.getSimpleName.stripSuffix("$"))
.setMaster("local[3]") // 启动3个Thread线程
// TODO: 设置每批次RDD中各个分区数据的最大值 -> 每个分区每秒的最大数据量
.set("spark.streaming.kafka.maxRatePerPartition", "10000")
// b. Set the batch interval (BatchInterval) to 5 seconds
new StreamingContext(sparkConf, Seconds(5))
}
// 2. Consume data from the Kafka topic with the New Consumer API
/*
def createDirectStream[K, V](
ssc: StreamingContext,
locationStrategy: LocationStrategy,
consumerStrategy: ConsumerStrategy[K, V]
): InputDStream[ConsumerRecord[K, V]]
*/
// a. Location strategy for consuming Kafka data
val locationStrategy: LocationStrategy = LocationStrategies.PreferConsistent
// b. Consumer strategy for consuming Kafka data
/*
def Subscribe[K, V](
topics: Iterable[jl.String],
kafkaParams: collection.Map[String, Object]
): ConsumerStrategy[K, V]
*/
// i. Topic names to consume (hypothetical topic name, replace with your own)
val topics: Array[String] = Array("wordcount-topic")
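// The rest of the consumer setup, sketched here for completeness; the broker
// address and group id below are assumptions, replace them with your own values.
// ii. Kafka consumer parameters
val kafkaParams: Map[String, Object] = Map[String, Object](
"bootstrap.servers" -> "localhost:9092", // assumed local broker
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "streaming-wordcount-group", // hypothetical consumer group id
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
// iii. Build the consumer strategy from the topic list and parameters
val consumerStrategy: ConsumerStrategy[String, String] =
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
// c. Create the direct stream: each record arrives as a ConsumerRecord[String, String]
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] =
KafkaUtils.createDirectStream[String, String](ssc, locationStrategy, consumerStrategy)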