第一步
依赖
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.4.3</version>
</dependency>
driver 编写
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object DirectKafkaWordCount {

  /**
   * Entry point: subscribes to Kafka topic "topic01" via the direct stream API
   * and prints a word count for every 2-second micro-batch.
   */
  def main(args: Array[String]): Unit = {
    // Local driver with 6 threads; 2-second batch interval.
    val conf = new SparkConf()
      .setAppName("DirectKafkaWordCount")
      .setMaster("local[6]")
    val streamingContext = new StreamingContext(conf, Seconds(2))
    // Silence Spark's verbose logging so the batch output is readable.
    streamingContext.sparkContext.setLogLevel("FATAL")

    // Kafka consumer settings. NOTE(review): the broker host "CentOS:9092" must be
    // resolvable from the driver; the shell examples in this doc use "spark:9092" — confirm.
    val consumerProps = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "CentOS:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> "g1",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
    )

    // Direct stream: one Kafka partition maps to one Spark partition.
    val records = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](List("topic01"), consumerProps)
    )

    // record value -> words -> (word, 1) pairs -> summed counts, printed per batch.
    records
      .map(_.value)
      .flatMap(_.split(" "))
      .map((_, 1L))
      .reduceByKey(_ + _)
      .print()

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
启动 kafka 测试
./bin/kafka-server-start.sh -daemon config/server.properties
> Kafka
./bin/kafka-topics.sh --zookeeper spark:2181 --create --topic topic01 --partitions 1 --replication-factor 1
Created topic "topic01".
./bin/kafka-console-producer.sh --broker-list spark:9092 --topic topic01
>d
>hellp oo oo
>
控制台输出
Time: 1565955684000 ms
-------------------------------------------
(oo,2)
(hellp,1)
-------------------------------------------