1、代码编写
(1)利用redis进行去重过滤基本代码实现
// Deduplicate the startup-log stream against Redis: drop any record whose mid
// has already been recorded in today's DAU set ("dau:yyyy-MM-dd").
val filteredDstream: DStream[Startuplog] = startuplogStream.transform {
  rdd =>
    // NOTE: count() forces an extra action per batch; acceptable for tutorial logging.
    println("过滤前:" + rdd.count())
    // Driver side — runs once per batch interval, so the DAU set is re-read
    // fresh from Redis on every micro-batch.
    val curdate: String = new SimpleDateFormat("yyyy-MM-dd").format(new Date)
    val jedis: Jedis = RedisUtil.getJedisClient
    val key = "dau:" + curdate
    val dauSet: util.Set[String] = jedis.smembers(key)
    // FIX: return the connection to the pool. The original never closed it,
    // leaking one pooled Jedis connection per batch until the pool was exhausted.
    jedis.close()
    // Broadcast the (usually small) mid set so every executor gets one copy
    // instead of serializing it into each task closure.
    val dauBC: Broadcast[util.Set[String]] = ssc.sparkContext.broadcast(dauSet)
    val filteredRDD: RDD[Startuplog] = rdd.filter {
      startuplog =>
        // Executor side — keep only mids not yet seen today.
        val seenToday: util.Set[String] = dauBC.value
        !seenToday.contains(startuplog.mid)
    }
    println("过滤后:" + filteredRDD.count())
    // transform must return the RDD for the downstream DStream.
    filteredRDD
}
完整代码实现
package com.study.gmall0315.realtime.app
import java.text.SimpleDateFormat
import java.util
import java.util.Date
import com.alibaba.fastjson.JSON
import com.study.gmall0315.common.constant.GmallConstant
import com.study.gmall0315.realtime.bean.Startuplog
import com.study.gmall0315.realtime.util.{
MyKafkaUtil, RedisUtil}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{
DStream, InputDStream}
import org.apache.spark.streaming.{
Seconds, StreamingContext}
import redis.clients.jedis.Jedis
object DauApp {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setAppName("dau_app").setAppName("local[*]")
val ssc = new StreamingContext(