What does it do?
Consume real-time data from Kafka, accumulate each user's click count per ad, and write the counts to MySQL. Whenever a user clicks the same ad more than 100 times within a single day, add that user to a blacklist.
Requirements breakdown
1. Read data from Kafka. Each record has the format: (timestamp province city userid adid).
2. Within each batch, count how many times each user clicked each ad, i.e. reduceByKey with dateKey_userId_adid as the key, where dateKey is the timestamp formatted as a date so that counts accumulate per day.
3. Write the accumulated counts to MySQL.
4. Read the counts back from MySQL and filter, keeping only the users with more than 100 clicks.
5. Update the blacklist table in MySQL. (The model classes these steps rely on are sketched right below.)
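The code below relies on two model classes from commons.model whose source isn't shown in this post. Judging by how they are constructed later, they presumably look like the following sketch (field names are my assumptions, inferred from usages such as AdUserClickCount(date, userId, adid, v) and item.userid):

package commons.model

// Assumed shape: one accumulated click-count row per (date, user, ad)
case class AdUserClickCount(date: String, userid: Long, adid: Long, clickCount: Long)

// Assumed shape: one blacklisted user id per row
case class AdBlacklist(userid: Long)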
Step-by-step walkthrough
1. Read data from Kafka and filter out records from already-blacklisted users:
val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(5))
val kafka_brokers = ConfigurationManager.config.getString("kafka.broker.list")
val kafka_topics = ConfigurationManager.config.getString(Constants.KAFKA_TOPICS)
val kafkaParam = Map(
  "bootstrap.servers" -> kafka_brokers,
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "0",
  // auto.offset.reset
  // latest: resume from the committed offset if one exists; otherwise start from the newest records
  // earliest: resume from the committed offset if one exists; otherwise start from the oldest records
  // none: resume from the committed offset if one exists; otherwise throw an error
  "auto.offset.reset" -> "latest",
  "enable.auto.commit" -> (false: java.lang.Boolean)
)
// adRealTimeDStream: DStream[RDD RDD RDD ...], each RDD holds Kafka messages as key/value pairs
val adRealTimeDStream = KafkaUtils.createDirectStream[String, String](ssc,
  LocationStrategies.PreferConsistent,
  ConsumerStrategies.Subscribe[String, String](Array(kafka_topics), kafkaParam)
)
// keep only the message value, i.e. the raw log line
val adRealTimeValueDStream = adRealTimeDStream.map(item => item.value)
// on every batch, reload the blacklist and drop log lines from blacklisted users
val adRealTimeFilterDstream = adRealTimeValueDStream.transform { rdd =>
  val blackList = AdBlacklistDAO.findAll()
  val blackIds = blackList.map(item => item.userid)
  rdd.filter { log =>
    val userId = log.split(" ")(3).toLong
    !blackIds.contains(userId)
  }
}
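AdBlacklistDAO belongs to the project's DAO layer and its source isn't shown here; findAll evidently loads every blacklisted user id. A minimal JDBC sketch, assuming a MySQL table ad_blacklist(user_id BIGINT PRIMARY KEY) and illustrative connection settings (the real project reads them from ConfigurationManager):

import java.sql.DriverManager
import scala.collection.mutable.ArrayBuffer
import commons.model.AdBlacklist

object AdBlacklistDAO {
  // illustrative settings; substitute your own MySQL URL and credentials
  private val url = "jdbc:mysql://localhost:3306/commerce"

  def findAll(): Array[AdBlacklist] = {
    val conn = DriverManager.getConnection(url, "user", "password")
    try {
      // load all blacklisted user ids
      val rs = conn.createStatement().executeQuery("SELECT user_id FROM ad_blacklist")
      val buffer = ArrayBuffer[AdBlacklist]()
      while (rs.next()) {
        buffer += AdBlacklist(rs.getLong("user_id"))
      }
      buffer.toArray
    } finally {
      conn.close()
    }
  }
}

Because transform runs its body on the driver once per batch, the blacklist is re-read from MySQL every 5 seconds, so newly blacklisted users take effect from the next batch onward.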
2. Count each user's clicks, keyed by dateKey_userId_adid:
val key2NumDStream = adRealTimeFilterDstream.map { log =>
  val logSplit = log.split(" ")
  val timeStamp = logSplit(0).toLong
  // format the timestamp into a day-level date key, e.g. yy-mm-dd
  val dateKey = DateUtils.formatDateKey(new Date(timeStamp))
  val userId = logSplit(3).toLong
  val adid = logSplit(4).toLong
  val key = dateKey + "_" + userId + "_" + adid
  (key, 1L)
}
val keyCountStream = key2NumDStream.reduceByKey(_ + _)
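For intuition, here is what the map plus reduceByKey does to a toy batch (the key strings are illustrative; the real dateKey format comes from DateUtils.formatDateKey):

val toyBatch = sparkSession.sparkContext.parallelize(Seq(
  ("20200101_38_5", 1L), // user 38 clicked ad 5
  ("20200101_38_5", 1L), // ... and clicked it again
  ("20200101_42_5", 1L)  // user 42 clicked ad 5 once
))
// summing the 1s per key gives (20200101_38_5,2) and (20200101_42_5,1), in some order
toyBatch.reduceByKey(_ + _).collect().foreach(println)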
3. Accumulate this batch's counts into MySQL:
keyCountStream.foreachRDD { rdd =>
  rdd.foreachPartition { part =>
    val clickCountArray = new ArrayBuffer[AdUserClickCount]()
    for ((k, v) <- part) {
      val keySplit = k.split("_")
      val date = keySplit(0)
      val userId = keySplit(1).toLong
      val adid = keySplit(2).toLong
      clickCountArray += AdUserClickCount(date, userId, adid, v)
    }
    if (clickCountArray.nonEmpty) {
      AdUserClickCountDAO.updateBatch1(clickCountArray.toArray)
    }
  }
}
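updateBatch1 is another project DAO method whose source isn't shown. Since the requirement is to accumulate counts across batches, it presumably performs an upsert. A minimal sketch, assuming a table ad_user_click_count(date, user_id, ad_id, click_count) with a unique key on (date, user_id, ad_id) and the same illustrative connection settings as before:

import java.sql.DriverManager
import commons.model.AdUserClickCount

object AdUserClickCountDAO {
  private val url = "jdbc:mysql://localhost:3306/commerce"

  def updateBatch1(records: Array[AdUserClickCount]): Unit = {
    val conn = DriverManager.getConnection(url, "user", "password")
    try {
      // insert a new row, or add this batch's count to the existing row
      val stmt = conn.prepareStatement(
        "INSERT INTO ad_user_click_count(date, user_id, ad_id, click_count) VALUES (?, ?, ?, ?) " +
        "ON DUPLICATE KEY UPDATE click_count = click_count + ?")
      for (r <- records) {
        stmt.setString(1, r.date)
        stmt.setLong(2, r.userid)
        stmt.setLong(3, r.adid)
        stmt.setLong(4, r.clickCount)
        stmt.setLong(5, r.clickCount)
        stmt.addBatch()
      }
      stmt.executeBatch()
    } finally {
      conn.close()
    }
  }
}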
4. For each record in keyCountStream, query the accumulated click count from the database and filter, keeping only users over the threshold:
val filterKeyCountStream = keyCountStream.filter { case (key, count) =>
  val keySplit = key.split("_")
  val date = keySplit(0)
  val userId = keySplit(1).toLong
  val adid = keySplit(2).toLong
  // look up the accumulated count for this (date, user, ad) in MySQL
  val clickCount = AdUserClickCountDAO.findClickCountByMultiKey(date, userId, adid)
  if (clickCount > 100) { // 100 clicks per day, as the requirement states
    println("userId " + userId + " exceeded the threshold and will be blacklisted")
    true
  } else {
    false
  }
}
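findClickCountByMultiKey looks up the accumulated count for one (date, user, ad) triple. A sketch of it as a companion method inside the AdUserClickCountDAO object above (same table and connection assumptions):

def findClickCountByMultiKey(date: String, userId: Long, adid: Long): Long = {
  val conn = DriverManager.getConnection(url, "user", "password")
  try {
    val stmt = conn.prepareStatement(
      "SELECT click_count FROM ad_user_click_count WHERE date = ? AND user_id = ? AND ad_id = ?")
    stmt.setString(1, date)
    stmt.setLong(2, userId)
    stmt.setLong(3, adid)
    val rs = stmt.executeQuery()
    // a user who has not clicked this ad today simply has no row yet
    if (rs.next()) rs.getLong("click_count") else 0L
  } finally {
    conn.close()
  }
}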
5. Deduplicate the remaining user ids and insert them into the blacklist:
val filterBlackListDstream = filterKeyCountStream.map { case (key, count) =>
  key.split("_")(1).toLong
}.transform(rdd => rdd.distinct())
filterBlackListDstream.foreachRDD { rdd =>
  rdd.foreachPartition { part =>
    val buffer = new ListBuffer[AdBlacklist]
    for (userId <- part) {
      buffer += AdBlacklist(userId)
    }
    AdBlacklistDAO.insertBatch(buffer.toArray)
  }
}
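Note that distinct() deduplicates only within a single batch; a user over the threshold keeps reappearing in later batches, so insertBatch must tolerate duplicates. A sketch, as a companion method inside the AdBlacklistDAO object above, using INSERT IGNORE against the primary key on user_id:

def insertBatch(records: Array[AdBlacklist]): Unit = {
  val conn = DriverManager.getConnection(url, "user", "password")
  try {
    // INSERT IGNORE silently skips users that are already blacklisted
    val stmt = conn.prepareStatement("INSERT IGNORE INTO ad_blacklist(user_id) VALUES (?)")
    for (r <- records) {
      stmt.setLong(1, r.userid)
      stmt.addBatch()
    }
    stmt.executeBatch()
  } finally {
    conn.close()
  }
}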
Complete code:
package advertstat // any project-specific package works; avoid naming it `scala`
import java.util.Date

import commons.conf.ConfigurationManager
import commons.constant.Constants
import commons.model.{AdBlacklist, AdUserClickCount}
import commons.utils.DateUtils
// AdBlacklistDAO and AdUserClickCountDAO are also used below; their package is not
// shown in this post, so import them from wherever they live in your project
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.{ArrayBuffer, ListBuffer}
object advertStat {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("adver")
      .setMaster("local[*]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    sparkSession.sparkContext.setLogLevel("ERROR")
    // val streamingContext = StreamingContext.getActiveOrCreate(checkpointDir, func)
    val ssc = new StreamingContext(sparkSession.sparkContext, Seconds(5))
    val kafka_brokers = ConfigurationManager.config.getString("kafka.broker.list")
    val kafka_topics = ConfigurationManager.config.getString(Constants.KAFKA_TOPICS)
    val kafkaParam = Map(
      "bootstrap.servers" -> kafka_brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "0",
      // auto.offset.reset
      // latest: resume from the committed offset if one exists; otherwise start from the newest records
      // earliest: resume from the committed offset if one exists; otherwise start from the oldest records
      // none: resume from the committed offset if one exists; otherwise throw an error
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // adRealTimeDStream: DStream[RDD RDD RDD ...], each RDD holds Kafka messages as key/value pairs
    val adRealTimeDStream = KafkaUtils.createDirectStream[String, String](ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(kafka_topics), kafkaParam)
    )
    // keep only the message value, i.e. the raw log line
    val adRealTimeValueDStream = adRealTimeDStream.map(item => item.value)
    // on every batch, reload the blacklist and drop log lines from blacklisted users
    val adRealTimeFilterDstream = adRealTimeValueDStream.transform { rdd =>
      val blackList = AdBlacklistDAO.findAll()
      val blackIds = blackList.map(item => item.userid)
      rdd.filter { log =>
        val userId = log.split(" ")(3).toLong
        !blackIds.contains(userId)
      }
    }
    /*
      Requirement 1: maintain the blacklist in real time
     */
    generateBlackList(adRealTimeFilterDstream)
    ssc.start()
    ssc.awaitTermination()
  }
  def generateBlackList(adRealTimeFilterDstream: DStream[String]): Unit = {
    // 1. Count each user's clicks, keyed by dateKey_userId_adid
    val key2NumDStream = adRealTimeFilterDstream.map { log =>
      val logSplit = log.split(" ")
      val timeStamp = logSplit(0).toLong
      // format the timestamp into a day-level date key, e.g. yy-mm-dd
      val dateKey = DateUtils.formatDateKey(new Date(timeStamp))
      val userId = logSplit(3).toLong
      val adid = logSplit(4).toLong
      val key = dateKey + "_" + userId + "_" + adid
      (key, 1L)
    }
    val keyCountStream = key2NumDStream.reduceByKey(_ + _)
    // 2. Accumulate this batch's counts into MySQL
    keyCountStream.foreachRDD { rdd =>
      rdd.foreachPartition { part =>
        val clickCountArray = new ArrayBuffer[AdUserClickCount]()
        for ((k, v) <- part) {
          val keySplit = k.split("_")
          val date = keySplit(0)
          val userId = keySplit(1).toLong
          val adid = keySplit(2).toLong
          clickCountArray += AdUserClickCount(date, userId, adid, v)
        }
        if (clickCountArray.nonEmpty) {
          AdUserClickCountDAO.updateBatch1(clickCountArray.toArray)
        }
      }
    }
    // 3. Query the accumulated totals back from MySQL and keep only users over the daily threshold
    val filterKeyCountStream = keyCountStream.filter { case (key, count) =>
      val keySplit = key.split("_")
      val date = keySplit(0)
      val userId = keySplit(1).toLong
      val adid = keySplit(2).toLong
      val clickCount = AdUserClickCountDAO.findClickCountByMultiKey(date, userId, adid)
      if (clickCount > 100) { // 100 clicks per day, as the requirement states
        println("userId " + userId + " exceeded the threshold and will be blacklisted")
        true
      } else {
        false
      }
    }
    // 4. Deduplicate the user ids within the batch and insert them into the blacklist
    val filterBlackListDstream = filterKeyCountStream.map { case (key, count) =>
      key.split("_")(1).toLong
    }.transform(rdd => rdd.distinct())
    filterBlackListDstream.foreachRDD { rdd =>
      rdd.foreachPartition { part =>
        val buffer = new ListBuffer[AdBlacklist]
        for (userId <- part) {
          buffer += AdBlacklist(userId)
        }
        AdBlacklistDAO.insertBatch(buffer.toArray)
      }
    }
  }
}