import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.{Seconds, StreamingContext}
object GetMsg {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GetMsg")
    // one micro-batch every 5 seconds; when running locally instead of through
    // spark-submit, also set a master, e.g. .setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Kafka consumer configuration
    val topics = Array("testOne")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.75.201:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "group01",
      "auto.offset.reset" -> "latest",
      // disable auto-commit so offsets can be committed manually after each batch
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Create a direct stream that consumes from Kafka
    val message = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    // Take the value of each record and run a word count per batch
    val lines = message.map(record => record.value())
    val wordCount = lines
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    wordCount.print()
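    // Because enable.auto.commit is false, nothing commits offsets back to Kafka.
    // A minimal sketch of committing them manually after each batch, using the
    // HasOffsetRanges/CanCommitOffsets API of spark-streaming-kafka-0-10; in a
    // real job, commit only after the batch has been processed successfully.
    import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}
    message.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      message.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }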
    ssc.start()
    ssc.awaitTermination()
  }
}
/*
auto.offset.reset: the strategy the Kafka consumer uses to choose where to start reading.
The local Kafka version here is 0.10, so the valid values are earliest | latest | none.
earliest: if a partition has a committed offset, consume from that offset; if not, consume from the beginning.
latest: if a partition has a committed offset, consume from that offset; if not, consume only data newly produced to that partition.
none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception.
*/
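/*
For a quick end-to-end test, the topic can be fed from a small standalone producer.
A minimal sketch, assuming the same broker and topic as above; SendMsg is a
hypothetical helper, not part of the original program.
*/
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object SendMsg {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "192.168.75.201:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // space-delimited words so the streaming word count has something to aggregate
    producer.send(new ProducerRecord[String, String]("testOne", "hello spark hello kafka"))
    producer.flush()
    producer.close()
  }
}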