SparkStreaming
Spark-Kafka integration
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>0.11.0.2</version>
</dependency>
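The spark-streaming-kafka-0-8 artifact supplies the receiver-based KafkaUtils.createStream used below; it tracks consumer offsets in ZooKeeper, which is why the program is given a ZooKeeper address rather than the Kafka broker list.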
package com.song.bigdata.stream

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Spark_Kafka {
  def main(args: Array[String]): Unit = {
    // Spark configuration object. Receiver-based streaming needs at least
    // two local threads: one for the receiver, one for processing.
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("Spark_Kafka")
    // Real-time processing context.
    // Batch interval: incoming data is grouped into batches of this duration.
    val streamingContext = new StreamingContext(sparkConf, Seconds(3))
    // Collect data from a socket on the given port:
    //val socketLineDStream = streamingContext.socketTextStream("linux1", 9999)
    // Collect data from Kafka (receiver-based; offsets are tracked in ZooKeeper).
    val kafkaDStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(
      streamingContext,
      "linux1:2181",          // ZooKeeper quorum
      "songGroup",            // consumer group id
      Map("sparkTopic" -> 3)  // topic -> number of consumer threads; the topic must already exist
      // List topics:   bin/kafka-topics.sh --zookeeper linux1:2181 --list
      // Create topic:  bin/kafka-topics.sh --zookeeper linux1:2181 --create --topic sparkTopic --partitions 3 --replication-factor 2
    )
    // Flatten each Kafka message value into individual words.
    val wordDStream = kafkaDStream.flatMap(t => t._2.split(" "))
    // Convert each word into a (word, 1) pair for counting.
    val mapDStream = wordDStream.map((_, 1))
    // Aggregate the pairs by key to get per-batch word counts.
    val wordToSumDStream = mapDStream.reduceByKey(_ + _)
    // Print each batch's results (println would only print the DStream reference).
    wordToSumDStream.print()
    // Start the receiver and the streaming computation.
    streamingContext.start()
    // Keep the driver alive while the receivers run.
    streamingContext.awaitTermination()
  }
}
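To feed the job, start a console producer against the topic (assuming the broker runs on linux1 with the default port 9092) and type a few space-separated words:

bin/kafka-console-producer.sh --broker-list linux1:9092 --topic sparkTopic

The 0-8 package also ships a direct (receiver-less) reader that fetches from the brokers instead of going through a ZooKeeper-backed receiver. A minimal sketch, assuming the same broker address; the rest of the word-count pipeline is unchanged, since it also yields (key, value) pairs:

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils

// Direct stream: no receiver; Spark Streaming tracks the offsets itself,
// so "metadata.broker.list" points at the Kafka brokers (assumed here to
// be linux1:9092), not at ZooKeeper.
val directDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
  streamingContext,
  Map("metadata.broker.list" -> "linux1:9092"),
  Set("sparkTopic")
)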