Ad Blacklist
Implement a real-time, dynamic blacklist mechanism: any user who clicks a given ad more than 100 times in a single day is blacklisted.
Note: the blacklist is stored in Redis.
1 Approach
1) After reading the data from Kafka, check each record against the blacklist stored in Redis;
2) If the check passes, increment that user's click count for the ad by one in Redis;
3) While updating the count in Redis, check whether the user's daily total for that ad has exceeded 100; if so, add the user to the blacklist (see the Redis layout sketch below).
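The Redis layout this section relies on is two structures: a hash holding per-day, per-user, per-ad click counts, and a set holding blacklisted user ids. A minimal sketch with Jedis (the key names match the code later in this section; the local connection and sample field values are illustrative):

import redis.clients.jedis.Jedis

// Minimal sketch of the Redis data layout (assumes a local Redis on port 6379)
val jedis = new Jedis("localhost", 6379)
// Hash "day:userid:adsid": field "<day>:<userId>:<adsId>" -> click count
val count: Long = jedis.hincrBy("day:userid:adsid", "2024-01-01:user1:ad1", 1L)
// Set "blackList": ids of users who have exceeded the daily threshold
if (count >= 100) jedis.sadd("blackList", "user1")
jedis.close()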
2 Environment Setup
We now turn to the real-time requirement, which is handled with Spark Streaming. In production the data source is almost always Kafka, and that is what we use here as well, so we first create a utility class in the common module for reading Kafka data with Spark Streaming.
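For reference, the build needs Spark Streaming, the Kafka 0-10 integration, and Jedis on the classpath. A minimal sbt sketch (the versions are illustrative, not prescribed by this section):

// Illustrative sbt dependencies; the exact versions in the project may differ
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.1.1",
  "org.apache.spark" %% "spark-streaming" % "2.1.1",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.1.1",
  "redis.clients" % "jedis" % "2.9.0"
)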
1)MyKafkaUtil
package com.atguigu.utils

import java.util.Properties

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object MyKafkaUtil {

  //1. Load the configuration
  private val properties: Properties = PropertiesUtil.load("config.properties")

  //2. Broker addresses used to bootstrap the connection to the cluster
  val broker_list: String = properties.getProperty("kafka.broker.list")

  //3. Kafka consumer configuration
  val kafkaParam = Map(
    "bootstrap.servers" -> broker_list,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    // Consumer group
    "group.id" -> "commerce-consumer-group",
    // Used when there is no initial offset, or the current offset no longer exists on any server;
    // "latest" resets the offset to the most recent one
    "auto.offset.reset" -> "latest",
    // If true, the consumer's offsets are committed automatically in the background,
    // which can lose data if Kafka goes down; if false, offsets must be managed manually
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )

  // Create a DStream that returns the received input data
  // LocationStrategies: create consumers for the given topics and cluster addresses
  // LocationStrategies.PreferConsistent: distribute partitions consistently across all Executors
  // ConsumerStrategies: choose how Kafka Consumers are created and configured on the Driver and Executors
  // ConsumerStrategies.Subscribe: subscribe to a collection of topics
  def getKafkaStream(topic: String, ssc: StreamingContext): InputDStream[ConsumerRecord[String, String]] = {
    val dStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParam)
    )
    dStream
  }
}
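Both MyKafkaUtil and RedisUtil below depend on a PropertiesUtil helper that is not shown in this section. A minimal sketch of what it might look like, assuming it only needs to load a properties file from the classpath (the real project version may differ):

package com.atguigu.utils

import java.io.InputStreamReader
import java.nio.charset.StandardCharsets
import java.util.Properties

// Hypothetical sketch of the PropertiesUtil referenced above
object PropertiesUtil {
  def load(propertiesName: String): Properties = {
    val prop = new Properties()
    // Load the properties file from the classpath as UTF-8
    prop.load(new InputStreamReader(
      Thread.currentThread().getContextClassLoader.getResourceAsStream(propertiesName),
      StandardCharsets.UTF_8))
    prop
  }
}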
2)RedisUtil
package com.atguigu.utils

import java.util.Properties

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object RedisUtil {

  var jedisPool: JedisPool = _

  def getJedisClient: Jedis = {
    if (jedisPool == null) {
      println("Creating a connection pool")
      val properties: Properties = PropertiesUtil.load("config.properties")
      val host: String = properties.getProperty("redis.host")
      val port: String = properties.getProperty("redis.port")

      val jedisPoolConfig = new JedisPoolConfig()
      jedisPoolConfig.setMaxTotal(100)            // maximum number of connections
      jedisPoolConfig.setMaxIdle(20)              // maximum number of idle connections
      jedisPoolConfig.setMinIdle(20)              // minimum number of idle connections
      jedisPoolConfig.setBlockWhenExhausted(true) // whether to block when the pool is exhausted
      jedisPoolConfig.setMaxWaitMillis(500)       // maximum wait time in milliseconds when blocking
      jedisPoolConfig.setTestOnBorrow(true)       // validate each connection when it is borrowed

      jedisPool = new JedisPool(jedisPoolConfig, host, port.toInt)
    }
    println(s"jedisPool.getNumActive = ${jedisPool.getNumActive}")
    println("Borrowing a connection")
    jedisPool.getResource
  }
}
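Since getJedisClient borrows a connection from the pool, callers should return it with close() when done; on a pooled Jedis instance, close() returns the connection to the pool rather than destroying it. A minimal usage sketch (the key and value are illustrative):

// Borrow a connection, use it, and return it to the pool
val jedis = RedisUtil.getJedisClient
try {
  jedis.set("test:key", "hello")
  println(jedis.get("test:key"))
} finally {
  jedis.close() // returns the connection to the pool
}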
3 Code Implementation
1)AdsInfo
case class AdsInfo(timestamp: Long, area: String, city: String, userId: String, adsId: String) {
  // Day string (yyyy-MM-dd) derived from the timestamp; used as part of the Redis hash field
  val dayString: String = new java.text.SimpleDateFormat("yyyy-MM-dd").format(new java.util.Date(timestamp))
}
2)BlackListHandler
import java.util

import com.atguigu.utils.RedisUtil
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.dstream.DStream
import redis.clients.jedis.Jedis

object BlackListHandler {

  val countKey = "day:userid:adsid"
  val blackList = "blackList"

  /**
   * Remove the ad-click records of users who are already on the blacklist
   *
   * @param adsInfoDStream the stream of ad-click records
   */
  def filterBlackList(adsInfoDStream: DStream[AdsInfo], sc: SparkContext): DStream[AdsInfo] = {
    adsInfoDStream.transform { rdd =>
      // 1. Read the current blacklist from Redis (runs on the Driver, once per batch)
      val client: Jedis = RedisUtil.getJedisClient
      val blackUids: util.Set[String] = client.smembers(blackList)
      client.close()
      // Broadcast the blacklist to the Executors
      val blackListBC: Broadcast[util.Set[String]] = sc.broadcast(blackUids)
      // 2. Keep only records whose user is not on the blacklist
      rdd.filter { info =>
        !blackListBC.value.contains(info.userId)
      }
    }
  }

  /**
   * Check whether a user should be added to the blacklist
   *
   * @param adsInfoDStream the stream of ad-click records
   */
  def checkUserToBlackList(adsInfoDStream: DStream[AdsInfo]): Unit = {
    adsInfoDStream.foreachRDD(rdd => {
      rdd.foreachPartition(infoIt => {
        val jedisClient: Jedis = RedisUtil.getJedisClient
        infoIt.foreach(info => {
          // 1. Increment this user's daily click count for this ad in Redis;
          //    hincrBy returns the value after the increment
          val field = s"${info.dayString}:${info.userId}:${info.adsId}"
          val clickCount: Long = jedisClient.hincrBy(countKey, field, 1L)
          // 2. If the daily count reaches the threshold of 100, add the user to the blacklist
          if (clickCount >= 100) {
            jedisClient.sadd(blackList, info.userId)
          }
        })
        jedisClient.close()
      })
    })
  }
}
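To verify that the handler behaves as expected, the two Redis structures can be inspected directly while the job runs; a quick sketch (assumes the keys defined above):

// Inspect the state written by BlackListHandler
val jedis = RedisUtil.getJedisClient
println(jedis.hgetAll("day:userid:adsid")) // per-day, per-user, per-ad click counts
println(jedis.smembers("blackList"))       // users currently blacklisted
jedis.close()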
3)RealtimeApp
import com.atguigu.utils.MyKafkaUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object RealtimeApp {

  def main(args: Array[String]): Unit = {
    // Read the data we need from Kafka
    // 1. Create the SparkConf object
    val conf: SparkConf = new SparkConf()
      .setAppName("RealTimeApp")
      .setMaster("local[*]")
    // 2. Create the SparkContext object
    val sc = new SparkContext(conf)
    // 3. Create the StreamingContext
    val ssc = new StreamingContext(sc, Seconds(2))
    // 4. Get the DStream
    val recordDStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream("ads_log", ssc)
    // 5. To simplify later computation, parse each consumed string into an AdsInfo object
    val adsInfoDStream: DStream[AdsInfo] = recordDStream.map { record =>
      val split: Array[String] = record.value.split(",")
      AdsInfo(split(0).toLong, split(1), split(2), split(3), split(4))
    }
    // 6. Requirement 5: filter out blacklisted users, then update the blacklist
    val filteredDStream: DStream[AdsInfo] = BlackListHandler.filterBlackList(adsInfoDStream, sc)
    BlackListHandler.checkUserToBlackList(filteredDStream)

    ssc.start()
    ssc.awaitTermination()
  }
}
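To exercise the job end to end, records in the "timestamp,area,city,userid,adid" format that RealtimeApp parses must land on the ads_log topic. A hypothetical test producer sketch (the broker address and sample field values are illustrative, not from this section):

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Hypothetical test producer; broker address and sample record are illustrative
object MockAdsProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop102:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // Format matches what RealtimeApp parses: timestamp,area,city,userid,adid
    val record = s"${System.currentTimeMillis()},east,beijing,user1,ad1"
    producer.send(new ProducerRecord[String, String]("ads_log", record))
    producer.close()
  }
}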