Project Implementation Code
package com.junfeng.bigdata.spark.streaming
import java.text.SimpleDateFormat
import com.junfeng.bigdata.spark.util.JDBCUtil
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import java.sql.{Connection, PreparedStatement, ResultSet}
import scala.collection.mutable.ListBuffer
object SparkStreaming11_Req1_BlackList1 {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming")
val ssc = new StreamingContext(sparkConf, Seconds(3))
//define the Kafka consumer parameters
val kafkaPara: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.1.6:9092,192.168.1.7:9092,192.168.1.8:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "atguigu",
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer"
)
val kafkaDataDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent, //location strategy: let the framework match partitions to executors
ConsumerStrategies.Subscribe[String, String](Set("atguigu"), kafkaPara)
)
val adClickData = kafkaDataDS.map(
kafkaData => {
val data = kafkaData.value()
val datas: Array[String] = data.split(" ")
AdClickData(datas(0), datas(1), datas(2), datas(3), datas(4))
}
)
val ds = adClickData.transform(
rdd => {
//TODO Periodically fetch the blacklist via JDBC (transform runs on the driver once per batch)
val blackList = ListBuffer[String]()
val conn: Connection = JDBCUtil.getConnection
val prepare: PreparedStatement = conn.prepareStatement("select userid from black_list")
val rs: ResultSet = prepare.executeQuery()
while (rs.next()) {
blackList.append(rs.getString(1))
}
rs.close()
prepare.close()
conn.close()
//TODO Drop clicks from users who are already on the blacklist
val filterRdd = rdd.filter(
data => {
!blackList.contains(data.user)
})
//TODO For users not on the blacklist, count their clicks within this batch, keyed by (day, user, ad)
filterRdd.map(
data => {
val sdf = new SimpleDateFormat("yyyy-MM-dd")
val day = sdf.format(new java.util.Date(data.ts.toLong))
val user = data.user
val ad = data.ad
((day, user, ad), 1)
}
).reduceByKey(_ + _) //aggregate within the batch so count below is the per-batch total per (day, user, ad)
}
)
ds.foreachRDD(
rdd => {
//rdd.foreach would open one JDBC connection per record.
//foreach is an RDD action: code outside the operator runs on the driver, code inside runs on the executors.
//That forms a closure, so driver-side objects must be serialized and shipped to the executors,
//but a database Connection is not serializable, so it cannot be created once on the driver and reused.
//RDD provides an operator that improves on this: foreachPartition.
//It lets one connection serve a whole partition, greatly reducing the number of connections and improving efficiency.
// rdd.foreachPartition(
// iter=>{
// val conn: Connection = JDBCUtil.getConnection
// iter.foreach{
// case ((day, user, ad), count) => {
//
// }
// }
// conn.close()
// }
// )
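//A completed foreachPartition sketch (a minimal illustration; the per-record
//threshold/upsert logic below would move inside iter.foreach unchanged):
// rdd.foreachPartition(
//   iter => {
//     val conn: Connection = JDBCUtil.getConnection //one connection per partition
//     try {
//       iter.foreach {
//         case ((day, user, ad), count) =>
//           //same blacklist / user_ad_count logic as below, reusing conn
//           println(s"${day} ${user} ${ad} ${count}")
//       }
//     } finally {
//       conn.close() //always return the connection to the pool
//     }
//   }
// )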
rdd.foreach {
case ((day, user, ad), count) => {
println(s"${day} ${user} ${ad} ${count}")
if (count >= 30) {
//TODO If the per-batch count reaches the threshold (30), put the user on the blacklist
val conn: Connection = JDBCUtil.getConnection
val sql =
"""
|insert into black_list(userid) value(?)
|on DUPLICATE KEY
|UPDATE userid = ?
|""".stripMargin
JDBCUtil.executeUpdate(conn,sql,Array(user,user))
conn.close()
} else {
//TODO Below the threshold: update today's click count for this (day, user, ad).
val conn: Connection = JDBCUtil.getConnection
val sql =
"""
|select *
|from user_ad_count
|where dt =? and userid =? and adid =?
|""".stripMargin
val flg = JDBCUtil.isExist(conn,sql,Array(day,user,ad))
//check whether a row for this (day, user, ad) already exists in the stats table
if (flg) {
//the row exists, so accumulate into it
val sql2 =
"""
|update user_ad_count
|set count = count+?
|where dt =? and userid =? and adid =?
|""".stripMargin
JDBCUtil.executeUpdate(conn,sql2,Array(count,day,user,ad))
//TODO If the accumulated count now reaches the threshold, put the user on the blacklist.
val sql3 =
"""
|select *
|from user_ad_count
|where dt =? and userid =? and adid =? and count >=30
|""".stripMargin
val flg2 = JDBCUtil.isExist(conn,sql3,Array(day,user,ad))
if (flg2) {
val sql4 = (
"""
|insert into black_list(userid) value(?)
|on DUPLICATE KEY
|UPDATE userid = ?
|""".stripMargin)
JDBCUtil.executeUpdate(conn,sql4,Array(user,user))
}
} else {
//no row yet, so insert one
val sql5 = (
"""
|insert into user_ad_count (dt,userid,adid,count) values(?,?,?,?)
|""".stripMargin)
JDBCUtil.executeUpdate(conn,sql5,Array(day,user,ad,count))
}
conn.close()
}
}
}
}
)
ssc.start()
ssc.awaitTermination()
}
//ad click record: timestamp, region, city, user id, ad id
case class AdClickData(ts: String, area: String, city: String, user: String, ad: String)
}
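The queries above assume two MySQL tables that the listing never shows. Below is a minimal DDL sketch inferred from the SQL (the column names come straight from the queries; the types and key choices are assumptions): black_list.userid must be a primary or unique key for the insert ... on DUPLICATE KEY UPDATE upsert to work, and user_ad_count needs a unique key over (dt, userid, adid) so the per-day update hits exactly one row.
package com.junfeng.bigdata.spark.util
object CreateTables {
def main(args: Array[String]): Unit = {
val conn = JDBCUtil.getConnection
//black_list: the ON DUPLICATE KEY upsert relies on userid being the primary key
JDBCUtil.executeUpdate(conn,
"""
|create table if not exists black_list (userid varchar(20) primary key)
|""".stripMargin, Array[Any]())
//user_ad_count: one row per (day, user, ad); the composite key makes the
//"update ... where dt = ? and userid = ? and adid = ?" target a single row
JDBCUtil.executeUpdate(conn,
"""
|create table if not exists user_ad_count (
|dt varchar(20), userid varchar(20), adid varchar(20), count bigint,
|primary key (dt, userid, adid)
|)
|""".stripMargin, Array[Any]())
conn.close()
}
}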
MySQL Utility Class
package com.junfeng.bigdata.spark.util
import com.alibaba.druid.pool.DruidDataSourceFactory
import java.sql.{Connection, PreparedStatement}
import java.util.Properties
object JDBCUtil {
//initialize the Druid connection pool
val dataSource = init()
def init()={
val properties = new Properties()
properties.setProperty("driverClassName","com.mysql.jdbc.Driver")
properties.setProperty("url","jdbc:mysql://localhost:3306/spark-streaming")
properties.setProperty("username","root")
properties.setProperty("password","ljf030511")
properties.setProperty("maxActive","50")
DruidDataSourceFactory.createDataSource(properties)
}
//get a MySQL connection from the pool
def getConnection={
dataSource.getConnection
}
//execute a DML statement (single-row insert or update)
def executeUpdate(connection: Connection,sql:String,params:Array[Any]): Int ={
var rtn = 0
var prepare:PreparedStatement = null
try {
connection.setAutoCommit(false)
prepare = connection.prepareStatement(sql)
if (params != null && params.length > 0) {
for (i <- params.indices) {
prepare.setObject(i + 1, params(i))
}
}
rtn = prepare.executeUpdate()
connection.commit()
}catch {
case e:Exception =>
connection.rollback() //roll back the open transaction instead of leaving it dangling
e.printStackTrace()
}finally {
if (prepare != null) prepare.close() //release the statement even when the update fails
}
rtn
}
//check whether a matching row exists
def isExist(connection:Connection,sql:String,params:Array[Any]): Boolean ={
var flag:Boolean = false
var prepare:PreparedStatement = null
try{
prepare = connection.prepareStatement(sql)
for(i <- params.indices){
prepare.setObject(i+1,params(i))
}
flag = prepare.executeQuery().next()
}catch {
case e:Exception => e.printStackTrace()
}finally {
if (prepare != null) prepare.close() //release the statement even when the query fails
}
flag
}
}
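A quick way to sanity-check the pool before starting the streaming job (a throwaway sketch; it only assumes the URL and credentials in init() point at a reachable MySQL instance):
package com.junfeng.bigdata.spark.util
object JDBCUtilSmokeTest {
def main(args: Array[String]): Unit = {
val conn = JDBCUtil.getConnection
//"select 1" round-trips through the Druid pool to MySQL without touching any table
val ok = JDBCUtil.isExist(conn, "select 1", Array[Any]())
println(s"connection ok: ${ok}")
conn.close()
}
}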
Mock Data Generator
package com.junfeng.bigdata.spark.streaming
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import java.util.Properties
import scala.collection.mutable.ListBuffer
import scala.util.Random
object SparkStreaming10_MockData {
def main(args: Array[String]): Unit = {
//generate mock data
//format: timestamp area city userid adid
//        (timestamp, region, city, user id, ad id)
//pipeline: Application => Kafka => SparkStreaming => Analysis
val prop = new Properties()
//producer configuration
prop.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,"master:9092,slave1:9092,slave2:9092")
prop.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,"org.apache.kafka.common.serialization.StringSerializer")
prop.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,"org.apache.kafka.common.serialization.StringSerializer")
val producer = new KafkaProducer[String,String](prop)
while (true){
mockdata().foreach(
data =>{
//send the record to Kafka
val record = new ProducerRecord[String,String]("atguigu",data)
producer.send(record)
println(data)
}
)
Thread.sleep(2000)
}
}
def mockdata()={
val list = ListBuffer[String]()
val areaList = ListBuffer[String]("华北","华东","华南")
val cityList = ListBuffer[String]("北京","上海","深圳")
val random = new Random() //one shared generator instead of a new instance per field
for(i<-1 to 30){
val area = areaList(random.nextInt(3))
val city = cityList(random.nextInt(3))
val userid = random.nextInt(6)+1
val adid = random.nextInt(6)+1
list.append(s"${System.currentTimeMillis()} ${area} ${city} ${userid} ${adid}")
}
list
}
}
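The producer and the streaming job agree only on a space-separated record layout (timestamp area city userid adid). A tiny round-trip check of that contract (the sample values are made up; AdClickData is the case class defined in the streaming job above):
package com.junfeng.bigdata.spark.streaming
object SparkStreaming10_FormatCheck {
def main(args: Array[String]): Unit = {
//one record exactly as mockdata() emits it
val line = s"${System.currentTimeMillis()} 华北 北京 3 5"
val datas = line.split(" ")
//the same positional parsing the streaming job applies to each Kafka value
val click = SparkStreaming11_Req1_BlackList1.AdClickData(datas(0), datas(1), datas(2), datas(3), datas(4))
println(click)
}
}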