Spark Streaming 从 Kafka 中读取数据,首先要安装好 Kafka 集群。
Kafka的原理和详细安装步骤请参考博客:
安装步骤:https://blog.csdn.net/weixin_43866709/article/details/88978954
原理:https://blog.csdn.net/weixin_43866709/article/details/88989349
创建 DStream 可以直接使用 KafkaUtils 工具类,示例代码如下:
package XXX
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Spark Streaming word count that reads from Kafka via the
 * receiver-based KafkaUtils API.
 */
object KafkaWordCount {

  /**
   * Entry point: consumes messages from Kafka with the receiver-based API
   * and prints a per-batch word count every 5 seconds.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[4]")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))

    // Address of the ZooKeeper quorum the Kafka receiver connects to.
    val zkQuorum = "L1:2181,L2:2181,L3:2181"
    // Kafka consumer group id.
    val groupId = "g1"
    // Topic name -> number of receiver threads for that topic.
    val topics = Map[String, Int]("tianmao" -> 1)

    // Receiver-based Kafka DStream; each element is a (message key, message value) pair.
    val kafkaStream: ReceiverInputDStream[(String, String)] =
      KafkaUtils.createStream(streamingContext, zkQuorum, groupId, topics)

    // Keep only the message payload (drop the Kafka key).
    val messages: DStream[String] = kafkaStream.map { case (_, value) => value }

    // Split each message into words, pair each word with 1,
    // then sum the counts per key within the batch.
    val wordCounts: DStream[(String, Int)] =
      messages
        .flatMap(_.split(" "))
        .map(word => (word, 1))
        .reduceByKey(_ + _)

    // Output action: print the counts of each micro-batch.
    wordCounts.print()

    // Start the streaming computation and block until it terminates.
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}