After Spark Streaming reads the data from Kafka, it arrives as one micro-batch every 5 seconds.
val conf = new SparkConf().setAppName("dau_app").setMaster("local[*]")
val ssc = new StreamingContext(conf,Seconds(5))
// Connect to Kafka and read the log data; the result is a stream of ConsumerRecord[String, String], where the key is null and the value holds the actual data
val inputDstream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream(GmallConstant.KAFKA_TOPIC_STARUP, ssc)
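MyKafkaUtil is a project helper that is not shown here. A minimal sketch of what it might look like, assuming the spark-streaming-kafka-0-10 direct-stream API and placeholder broker/group settings:

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object MyKafkaUtil {
  // Assumed consumer settings; the broker list and group id are placeholders, not the project's real config
  private val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> "hadoop102:9092",
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "gmall_dau_group",
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )

  def getKafkaStream(topic: String, ssc: StreamingContext): InputDStream[ConsumerRecord[String, String]] = {
    // Direct stream: each record arrives as ConsumerRecord[String, String]; the key is null for these logs
    KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq(topic), kafkaParams)
    )
  }
}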
// 2. Convert each record into a case class and add the two time fields
val startupLogDstream: DStream[StartupLog] = inputDstream.map { record =>
val jsonString: String = record.value()
val startupLog: StartupLog = JSON.parseObject(jsonString, classOf[StartupLog])
val formatter = new SimpleDateFormat("yyyy-MM-dd HH")
val datetimeStr: String = formatter.format(new Date(startupLog.ts))
val dateTimeArr: Array[String] = datetimeStr.split(" ")
startupLog.logDate = dateTimeArr(0)
startupLog.logHour = dateTimeArr(1)
startupLog
}
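StartupLog is defined elsewhere in the project. A minimal sketch covering only the fields used in this section (the real class also carries the other fields of the JSON log), with the two time fields mutable so the map above can fill them in:

case class StartupLog(mid: String,          // device id, used as the deduplication key
                      ts: Long,             // event timestamp in milliseconds
                      var logDate: String,  // filled in above as "yyyy-MM-dd"
                      var logHour: String)  // filled in above as "HH"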
--------------------------Deduplication logic with Redis-----------------------------------------------
// 4. Deduplicate with Redis, approach 1
startupLogDstream.filter { startupLog => // a Jedis connection is opened and closed for every single record here, which hurts performance
  val jedis: Jedis = RedisUtil.getJedisClient
  val key = "dau:" + startupLog.logDate
  val notSeen = !jedis.sismember(key, startupLog.mid)
  jedis.close()
  notSeen
}
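RedisUtil is also a project helper. A minimal pooled-connection sketch, assuming a placeholder host and port:

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object RedisUtil {
  // Assumed host/port; in the real project these would come from a config file
  private val poolConfig = new JedisPoolConfig()
  poolConfig.setMaxTotal(100) // cap on simultaneously borrowed connections
  poolConfig.setMaxIdle(20)
  private val jedisPool = new JedisPool(poolConfig, "hadoop102", 6379)

  // Borrow a connection from the pool; callers must close() it to return it
  def getJedisClient: Jedis = jedisPool.getResource
}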
// 4. Deduplicate with Redis, approach 2 (flawed): the dauBC broadcast variable is initialized only once on the driver, so every later batch keeps filtering against a stale snapshot
val curdate: String = new SimpleDateFormat("yyyy-MM-dd").format(new Date)
val jedis: Jedis = RedisUtil.getJedisClient
val key = "dau:" + curdate
val dauSet: util.Set[String] = jedis.smembers(key)
jedis.close()
val dauBC: Broadcast[util.Set[String]] = ssc.sparkContext.broadcast(dauSet)
startupLogDstream.filter { startuplog =>
  val dauSet: util.Set[String] = dauBC.value
  !dauSet.contains(startuplog.mid)
}
// 4. Deduplicate with Redis, approach 3
// The transform operator: the code outside the RDD operation runs on the driver (once per batch), the code inside runs on the executors
val filteredDStream: DStream[StartupLog] = startupLogDstream.transform { rdd =>
  println("before filtering: " + rdd.count())
  // runs on the driver, once per batch interval, so the dedup set is refreshed every batch
  val curdate: String = new SimpleDateFormat("yyyy-MM-dd").format(new Date)
  val jedis: Jedis = RedisUtil.getJedisClient
  val key = "dau:" + curdate
  val dauSet: util.Set[String] = jedis.smembers(key)
  jedis.close()
  val dauBC: Broadcast[util.Set[String]] = ssc.sparkContext.broadcast(dauSet)
  val filteredRDD: RDD[StartupLog] = rdd.filter { startuplog =>
    // runs on the executors
    val dauSet: util.Set[String] = dauBC.value
    !dauSet.contains(startuplog.mid)
  }
  println("after filtering: " + filteredRDD.count())
  filteredRDD
}
// Remaining problem: if the same user starts the app several times within one 5-second batch, those duplicates are not filtered, because the user has not been written to Redis yet
// Fix: group by key so that records with the same mid land in the same group, then keep only the first record of each group
val groupByMidDstream: DStream[(String, Iterable[StartupLog])] = filteredDStream.map(startuplog => (startuplog.mid, startuplog)).groupByKey()
// map keeps each group as an Iterable, so the result is a DStream of collections rather than of records
val value: DStream[Iterable[StartupLog]] = groupByMidDstream.map { case (mid, startuplogIter) =>
  startuplogIter.take(1)
}
// flatMap flattens the Iterable instead, so we get individual StartupLog records back
val distinctDstream: DStream[StartupLog] = groupByMidDstream.flatMap { case (mid, startupLogItr) =>
  startupLogItr.take(1)
}
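As a lighter alternative (not in the original notes), the same within-batch deduplication can be done with reduceByKey, which keeps one record per mid without materializing whole groups during the shuffle:

// reduceByKey combines map-side, so only one record per mid is shuffled
val distinctByReduce: DStream[StartupLog] = filteredDStream
  .map(startuplog => (startuplog.mid, startuplog))
  .reduceByKey((first, _) => first)
  .map { case (_, startuplog) => startuplog }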
------------------------------Saving the data to Redis----------------------------------------------
// 3. Save the data to Redis, approach 1
startupLogDstream.foreachRDD { rdd => // this block runs on the driver
  // Redis stores the DAU mids in a set, which is what the dedup filter checks with sismember
  rdd.foreach { startupLog => // this runs on the executors; a Jedis created on the driver cannot be referenced here (it is not serializable), but creating it here opens and closes a connection for every single record
    val jedis: Jedis = RedisUtil.getJedisClient
    val key = "dau:" + startupLog.logDate
    val value = startupLog.mid
    jedis.sadd(key, value)
    jedis.close()
  }
}
// Save the data to Redis, approach 2
startupLogDstream.foreachRDD { rdd => // this block runs on the driver
  rdd.foreachPartition { startuplogIter => // this runs on the executors; opening one connection per partition instead of per record is a big improvement over approach 1
    val jedis: Jedis = RedisUtil.getJedisClient
    for (startupLog <- startuplogIter) {
      val key = "dau:" + startupLog.logDate
      val value = startupLog.mid
      jedis.sadd(key, value)
    }
    jedis.close()
  }
}
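To actually run the job, the streaming context still has to be started and kept alive at the end of main:

ssc.start()            // start consuming and processing the 5-second batches
ssc.awaitTermination() // block the driver until the application is stopped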