import java.net.URLDecoder
import java.sql.{Connection, DriverManager}

import com.spark.common.{EventLogConstants, LoggerUtil, Test, TimeUtil}
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.log4j.Logger
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.immutable.HashMap
object SxRlStatDemo extends Serializable {

  val logger = Logger.getLogger(classOf[LoggerUtil])
  private val serialVersionUID = -4892194648703458595L

  /**
   * Entry point: consumes raw access-log lines from Kafka, parses each line
   * ("ip^Atime^Arequest") into a field map, and maintains a running count per
   * "bc_person" across batches via updateStateByKey.
   */
  def main(args: Array[String]): Unit = {
    // Cap per-partition ingest rate and enable backpressure so consumption
    // adapts to the processing speed of the stream.
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("sxdemo")
      .set("spark.streaming.kafka.maxRatePerPartition", "100")
      .set("spark.streaming.backpressure.enabled", "true") // enable backpressure

    val sc = SparkContext.getOrCreate(conf)
    val ssc = new StreamingContext(sc, Seconds(1))

    // Direct (simple consumer) Kafka API takes only two required parameters:
    //   metadata.broker.list -- Kafka broker addresses
    //   auto.offset.reset    -- "largest" = start from the newest offsets,
    //                           "smallest" = from the oldest; applied on every launch.
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "hadoop04:9092,hadoop05:9092,hadoop06:9092",
      "auto.offset.reset" -> "largest",
      "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer",
      "value.serializer" -> "org.apache.kafka.common.serialization.StringSerializer")
    // Set of topic names to subscribe to.
    val topics = Set("topic_bc")

    // Build the DStream: take the message value and parse it into a Map of fields.
    val stream = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      .map(_._2)
      .transform(rdd => {
        rdd.map(log => {
          var map: Map[String, String] = new HashMap[String, String]
          val splits = log.split("\\^A")
          if (splits.length == 3) {
            val ip = splits(0).trim
            // Normalised server time; "-1" marks an unparseable timestamp.
            val nginxTime = TimeUtil.parseNginxServerTime2Long(splits(1).trim).toString
            val requestStr = splits(2)
            val index = requestStr.indexOf("?")
            if (index > -1) { // only requests carrying a query string are usable
              val requestBody: String = requestStr.substring(index + 1)
              // Geo lookup from IP (country/province/city); "un" = unknown.
              val areaInfo = if (ip.nonEmpty) Test.getInfo(ip) else Array("un", "un", "un")
              val requestParames = requestBody.split("&")
              // NOTE(fix): original source lost the generator of this for-loop
              // ("for (e" with no "<- requestParames"); restored here.
              for (e <- requestParames) {
                val eqIdx = e.indexOf("=")
                if (eqIdx < 1) {
                  // Parameter is not in key=value shape: skip it. The original
                  // logged but then still called substring with an invalid
                  // index, throwing StringIndexOutOfBoundsException.
                  logger.debug("次日志无法解析")
                } else {
                  val key = e.substring(0, eqIdx)
                  val value = URLDecoder.decode(e.substring(eqIdx + 1), EventLogConstants.LOG_PARAM_CHARSET)
                  map += (key -> value)
                }
              }
              map += ("ip" -> ip, "s_time" -> nginxTime,
                "country" -> areaInfo(0), "provence" -> areaInfo(1), "city" -> areaInfo(2))
            } else {
              logger.debug("次日志无法解析")
            }
          }
          map
        })
      })

    stream.cache()
    // updateStateByKey requires a checkpoint directory for its state store.
    ssc.checkpoint("checkpoint")

    // Running count per "bc_person" over all batches, restricted to events
    // whose "en" field equals "e_sx". Map#contains checks key presence, so
    // log("en") is safe after the filter.
    val bc_personAmt = stream
      .filter(log => log.contains("en") && log("en") == "e_sx")
      .map(log => (log("bc_person"), 1))
      .updateStateByKey[Long]((seq: Seq[Int], state: Option[Long]) => {
        // seq:   values for this key in the current batch
        // state: accumulated total from all previous batches
        val currentValue = seq.sum
        val preValue = state.getOrElse(0L)
        Some(currentValue + preValue)
      })
    // NOTE(review): main continues beyond this chunk — output of bc_personAmt
    // and ssc.start()/awaitTermination() plus closing braces are expected below.