Word count: consume messages from Kafka with Spark Streaming's 0-10 direct stream and keep a running count per word with updateStateByKey.
package zygDemo1.kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.{HashPartitioner, SparkConf, TaskContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object LoadKafkaDataDemo {

  def main(args: Array[String]): Unit = {
    // Recover the StreamingContext from the checkpoint directory if it exists, otherwise create a new one
    val cpdir = "d://zyg-20190716-2"
    val strCon = StreamingContext.getOrCreate(cpdir, () => createContext())
    strCon.start()
    strCon.awaitTermination()
  }

  def createContext(): StreamingContext = {
    import java.lang
    val conf = new SparkConf().setAppName("LoadKafkaDataDemo").setMaster("local[*]")
    val context: StreamingContext = new StreamingContext(conf, Seconds(5))
    // checkpoint directory: use the same path that is passed to StreamingContext.getOrCreate, otherwise recovery never finds the data
    context.checkpoint("d://zyg-20190716-2")
    val topics = Array("test2")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "mini1:9092,mini2:9092,mini3:9092",
      // deserializers for the message key and value
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      // consumer group id
      "group.id" -> "group1",
      // where to start consuming when there is no committed offset for this group
      "auto.offset.reset" -> "latest",
      // let the Kafka consumer commit offsets automatically
      "enable.auto.commit" -> (true: lang.Boolean))
    // direct stream provided by the spark-streaming-kafka-0-10 integration
    val msgs: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      context,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(topics, kafkaParams)
    )
    // Print the OffsetRange information of each batch (kept for reference)
    // msgs.foreachRDD(rdd => {
    //   // get the offset ranges carried by this batch's RDD
    //   val offsetsList: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    //
    //   rdd.foreachPartition(part => {
    //     part.foreach(line => {
    //       val or: OffsetRange = offsetsList(TaskContext.get().partitionId())
    //       println("----------topic:" + or.topic)
    //       println("----------partition:" + or.partition)
    //       println("----------fromOffset:" + or.fromOffset)
    //       println("----------topicPartition:" + or.topicPartition())
    //       println("----------line:" + line)
    //     })
    //   })
    // })
    // word count
    // take the message value out of each ConsumerRecord (the key is not used)
    val lines: DStream[String] = msgs.map(_.value())
    // split every line into words and pair each word with 1
    val tup: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))
    // partitioner used for the state RDDs
    val partitioner = new HashPartitioner(context.sparkContext.defaultParallelism)
    // stateful word count: merge this batch's counts with the accumulated history (requires checkpointing)
    val sumd = tup.updateStateByKey(func, partitioner, true)
    sumd.print()
    context
  }
  // update function: pattern-matches on (word, counts in this batch, previous total) and returns (word, new total)
  val func = (it: Iterator[(String, Seq[Int], Option[Int])]) => {
    it.map {
      case (x, y, z) => (x, y.sum + z.getOrElse(0))
    }
  }
}
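
For reference, a common variation on the demo above is to turn off auto-commit and write offsets back to Kafka only after a batch has been processed, using the CanCommitOffsets handle that the 0-10 integration exposes on the direct stream. The sketch below is only illustrative: it assumes "enable.auto.commit" has been set to (false: java.lang.Boolean) in kafkaParams, which differs from the code above, and it reuses the msgs stream from createContext.

// minimal sketch, assuming auto-commit has been disabled in kafkaParams
msgs.foreachRDD { rdd =>
  // offset ranges carried by this batch's RDD
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process rdd here (for example, the word-count logic above) ...
  // asynchronously commit the consumed offsets for group "group1" back to Kafka
  msgs.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}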