// Read data from Kafka and analyze it in real time with Spark Streaming.
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
// Stateful variant: word counts are accumulated across batches.
/**
 * Stateful word count over a Kafka topic using Spark Streaming.
 *
 * Each record value read from the subscribed topic is split on single spaces,
 * and a running total per word is maintained across batches with
 * `updateStateByKey`. The totals are printed once per batch.
 */
object Test2_kafka_streaming_stateful {

  /**
   * Builds the streaming pipeline, starts it and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setMaster("local[*]").setAppName("NetWorkWordCount")
    val ssc = new StreamingContext(conf, Seconds(2)) // 2-second batch interval

    // Windowed and stateful operations require checkpointing; the directory is
    // set through StreamingContext.checkpoint. This is a local directory for
    // now and is intended to become an HDFS path later.
    ssc.checkpoint("./check")

    val kafkaParams = Map[String, Object](
      // NOTE(review): node3 uses port 9093 while the other brokers use 9092 - confirm this is intended.
      "bootstrap.servers" -> "node1:9092,node2:9092,node3:9093",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "streaming74", // consumer group id
      // "latest" starts from the newest offsets when the group has no committed
      // offset (it does NOT read from the beginning; that would be "earliest").
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (true: java.lang.Boolean) // offsets are committed automatically by the consumer
    )

    // Topics to subscribe to.
    val topics = Array("topicAA")

    // Create the direct DStream of Kafka consumer records.
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    val lines: DStream[String] = stream.map(record => record.value())
    // Consecutive spaces make split(" ") produce empty tokens; drop them so the
    // empty string is not counted as a word.
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)
    val wordAndOne = words.map((_, 1))

    // Stateless alternative (per-batch counts only):
    // val reduced = wordAndOne.reduceByKey(_ + _)

    // defaultParallelism is used to size the state partitioner;
    // defaultMinPartitions is capped at 2 and is meant for Hadoop input splits.
    // The trailing `true` is rememberPartitioner: keep the partitioner on the
    // generated state RDDs.
    val reduced = wordAndOne.updateStateByKey(
      updateFunc,
      new HashPartitioner(ssc.sparkContext.defaultParallelism),
      true
    )
    reduced.print()

    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * State update function passed to `updateStateByKey`.
   *
   * Each element of the input iterator is a triple of:
   *  - `String`: the key (word) being aggregated,
   *  - `Seq[Int]`: the counts seen for that key in the current batch
   *    (e.g. `Seq(1, 1, 1)`), which are summed,
   *  - `Option[Int]`: the previously accumulated total, or `None` when the key
   *    has no state yet (treated as 0).
   *
   * It yields `(word, newTotal)` pairs.
   */
  val updateFunc: Iterator[(String, Seq[Int], Option[Int])] => Iterator[(String, Int)] =
    _.map { case (word, batchCounts, previousTotal) =>
      (word, batchCounts.sum + previousTotal.getOrElse(0))
    }
}