Dependencies
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-auth</artifactId>
    <version>2.9.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
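The Kafka client library itself is pulled in transitively by spark-streaming-kafka-0-10_2.11 (for Spark 2.4.3 this should resolve to kafka-clients 2.0.0, as far as I can tell). If you prefer to pin it explicitly, a dependency along the following lines can be added; the version shown is an assumption and should match whatever the Spark artifact brings in:

<!-- optional: pin the transitively resolved Kafka client; version is assumed -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.0.0</version>
</dependency>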
Code
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer

object KafkaSink extends Serializable {

  def createKafkaConnection(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "CentOS:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    // Idempotent producer: retries do not create duplicate records
    props.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, "true")
    props.put(ProducerConfig.RETRIES_CONFIG, "2")
    // Small batches, flushed after at most one second
    props.put(ProducerConfig.BATCH_SIZE_CONFIG, "100")
    props.put(ProducerConfig.LINGER_MS_CONFIG, "1000")
    new KafkaProducer[String, String](props)
  }

  // One producer per executor JVM, created lazily on first use
  lazy val kafkaProducer: KafkaProducer[String, String] = createKafkaConnection()

  // Flush and release the producer when the JVM exits
  Runtime.getRuntime.addShutdownHook(new Thread() {
    override def run(): Unit = {
      kafkaProducer.close()
    }
  })

  def save(vs: Iterator[(String, Int)]): Unit = {
    try {
      vs.foreach(tuple => {
        val record = new ProducerRecord[String, String]("topic02", tuple._1, tuple._2.toString)
        kafkaProducer.send(record)
      })
    } catch {
      case e: Exception => println("Send an alert e-mail, something went wrong~")
    }
  }
}
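Before wiring the sink into Spark, it can be exercised on its own: save only needs an Iterator[(String, Int)], and the producer is created lazily on the first call. A minimal sketch, assuming the broker at CentOS:9092 configured above (the test object name and sample data are made up for illustration):

// Hypothetical smoke test for KafkaSink; broker and topic come from the config above
object KafkaSinkSmokeTest {
  def main(args: Array[String]): Unit = {
    val sample = Iterator(("hello", 3), ("spark", 1))
    KafkaSink.save(sample)          // first call triggers the lazy producer creation
    KafkaSink.kafkaProducer.flush() // push buffered records out before the JVM exits
  }
}

The records should then show up on topic02 with the word as the key and the count as the value.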
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}

// Driver, wrapped in an object/main so the snippet compiles as-is
object DirectKafkaWordCount {

  def main(args: Array[String]): Unit = {
    val checkpointDir = "file:///D:/checkpointdir"
    // Rebuild the StreamingContext from the checkpoint if one exists,
    // otherwise create it with the factory function below
    val ssc = StreamingContext.getOrCreate(checkpointDir, () => {
      println("==========init ssc==========")
      val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount").setMaster("local[6]")
      val ssc = new StreamingContext(sparkConf, Seconds(2))
      ssc.checkpoint(checkpointDir)

      val kafkaParams = Map[String, Object](
        ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "CentOS:9092",
        ConsumerConfig.GROUP_ID_CONFIG -> "g1",
        ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
        ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])

      val messages = KafkaUtils.createDirectStream[String, String](ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](List("topic01"), kafkaParams))

      messages.map(record => record.value)
        .flatMap(line => line.split(" "))
        .map(word => (word, 1))
        // Keep a running count per word across micro-batches
        .mapWithState(StateSpec.function((k: String, v: Option[Int], state: State[Int]) => {
          var total: Int = 0
          if (state.exists()) {
            total = state.getOption().getOrElse(0)
          }
          total += v.getOrElse(0)
          state.update(total)
          (k, total)
        }))
        .foreachRDD(rdd => {
          rdd.foreachPartition(vs => {
            // Write each partition's results back to Kafka (topic02)
            KafkaSink.save(vs)
          })
        })
      ssc
    })

    ssc.sparkContext.setLogLevel("FATAL")
    ssc.start()
    ssc.awaitTermination()
  }
}
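To watch the whole pipeline work end to end, something has to write lines into topic01. A minimal feeder sketch, reusing the broker address and topic name from the code above (the object name and sample sentences are made up):

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer

// Hypothetical test producer: each line becomes one record on topic01,
// which the streaming job splits into words and counts
object TestFeeder {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "CentOS:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    val producer = new KafkaProducer[String, String](props)
    List("this is a demo", "hello spark streaming").foreach { line =>
      producer.send(new ProducerRecord[String, String]("topic01", line))
    }
    producer.close()
  }
}

The running word counts then appear on topic02, delivered by KafkaSink.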