2.1 创建表
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for `ad_blacklist`
-- Blacklisted user ids; the streaming job may insert the same userid in
-- different batches, so duplicates are possible (no UNIQUE key on purpose).
-- ----------------------------
DROP TABLE IF EXISTS `ad_blacklist`;
CREATE TABLE `ad_blacklist` (
  `userid` int(11) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for `ad_click_trend`
-- Minute-level click counts per ad. The DAO issues one
-- SELECT ... WHERE date/hour/minute/adid per input row, so index those columns.
-- ----------------------------
DROP TABLE IF EXISTS `ad_click_trend`;
CREATE TABLE `ad_click_trend` (
  `date` varchar(30) NOT NULL,
  `hour` varchar(30) NOT NULL,
  `minute` varchar(30) NOT NULL,
  `adid` int(11) NOT NULL,
  `clickCount` int(11) NOT NULL,
  KEY `ad_click_trend_date_hour_minute_adid_idx` (`date`, `hour`, `minute`, `adid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for `ad_province_top3`
-- Top-3 ads per (date, province). Rows are bulk-deleted by (date, province)
-- before re-insert, so index that pair.
-- ----------------------------
DROP TABLE IF EXISTS `ad_province_top3`;
CREATE TABLE `ad_province_top3` (
  `date` varchar(30) NOT NULL,
  `province` varchar(100) NOT NULL,
  `adid` int(11) NOT NULL,
  `clickCount` int(11) NOT NULL,
  KEY `ad_province_top3_date_province_idx` (`date`, `province`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for `ad_stat`
-- Cumulative click counts per (date, province, city, ad). The DAO probes each
-- key with a SELECT before insert/update, so index the full lookup key.
-- ----------------------------
DROP TABLE IF EXISTS `ad_stat`;
CREATE TABLE `ad_stat` (
  `date` varchar(30) NOT NULL,
  `province` varchar(100) NOT NULL,
  `city` varchar(100) NOT NULL,
  `adid` int(11) NOT NULL,
  `clickCount` int(11) NOT NULL,
  KEY `ad_stat_date_province_city_adid_idx` (`date`, `province`, `city`, `adid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for `ad_user_click_count`
-- Per-user, per-ad daily click totals, looked up by (date, userid, adid)
-- both for the upsert probe and for findClickCountByMultiKey.
-- ----------------------------
DROP TABLE IF EXISTS `ad_user_click_count`;
CREATE TABLE `ad_user_click_count` (
  `date` varchar(30) NOT NULL,
  `userid` int(11) NOT NULL,
  `adid` int(11) NOT NULL,
  `clickCount` int(11) NOT NULL,
  KEY `ad_user_click_count_date_userid_adid_idx` (`date`, `userid`, `adid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2.2往kafka写数据
启动kafka集群
启动MockRealTimeData类
在kafka启动消费者,如果能够消费数据,说明kafka可以使用。
kafka数据:
timestamp province city userid adid
2.3 case class
/**
 * A user on the ad-click blacklist (row of table ad_blacklist).
 *
 */
case class AdBlacklist(userid:Long)
/**
 * Per-user, per-ad daily click count (row of table ad_user_click_count).
 * @author wuyufei
 *
 */
case class AdUserClickCount(date:String,
userid:Long,
adid:Long,
clickCount:Long)
/**
 * Real-time cumulative ad statistics per date/province/city/ad
 * (row of table ad_stat).
 *
 */
case class AdStat(date:String,
province:String,
city:String,
adid:Long,
clickCount:Long)
/**
 * One of the top-3 ads of a province for a given date
 * (row of table ad_province_top3).
 *
 */
case class AdProvinceTop3(date:String,
province:String,
adid:Long,
clickCount:Long)
/**
 * Minute-level ad click trend (row of table ad_click_trend).
 *
 */
case class AdClickTrend(date:String,
hour:String,
minute:String,
adid:Long,
clickCount:Long)
2.4 mysql JDBC
/**
 * DAO for the ad-click blacklist table (ad_blacklist).
 */
object AdBlacklistDAO {

  /**
   * Batch-insert blacklisted users.
   *
   * @param adBlacklists users to add to the blacklist
   */
  def insertBatch(adBlacklists: Array[AdBlacklist]) {
    val sql = "INSERT INTO ad_blacklist VALUES(?)"
    // One single-element parameter array per user id.
    val paramsList = new ArrayBuffer[Array[Any]]()
    for (adBlacklist <- adBlacklists) {
      paramsList += Array[Any](adBlacklist.userid)
    }
    // Borrow a pooled MySQL client from the singleton pool.
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()
    try {
      client.executeBatch(sql, paramsList.toArray)
    } finally {
      // Always return the connection, even if the batch fails,
      // so a SQL error cannot drain the pool.
      mySqlPool.returnObject(client)
    }
  }

  /**
   * Load every blacklisted user.
   *
   * @return all rows of ad_blacklist as AdBlacklist instances
   */
  def findAll(): Array[AdBlacklist] = {
    // Explicit column list instead of SELECT * — the callback reads column 1.
    val sql = "SELECT userid FROM ad_blacklist"
    val adBlacklists = new ArrayBuffer[AdBlacklist]()
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()
    try {
      // Collect every userid from the result set.
      client.executeQuery(sql, null, new QueryCallback {
        override def process(rs: ResultSet): Unit = {
          while (rs.next()) {
            adBlacklists += AdBlacklist(rs.getInt(1).toLong)
          }
        }
      })
    } finally {
      mySqlPool.returnObject(client)
    }
    adBlacklists.toArray
  }
}
/**
 * DAO for per-user, per-ad daily click counts (table ad_user_click_count).
 */
object AdUserClickCountDAO {

  /**
   * Upsert a batch of click-count deltas: keys that already exist
   * (same date/userid/adid) get clickCount incremented, new keys are inserted.
   *
   * NOTE(review): the SELECT-then-INSERT/UPDATE pattern is race-prone if two
   * workers write the same key concurrently; a unique key plus
   * INSERT ... ON DUPLICATE KEY UPDATE would make this atomic.
   *
   * @param adUserClickCounts click-count deltas to merge into the table
   */
  def updateBatch(adUserClickCounts: Array[AdUserClickCount]) {
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()
    try {
      // Split the input into rows to insert and rows to update.
      val insertAdUserClickCounts = ArrayBuffer[AdUserClickCount]()
      val updateAdUserClickCounts = ArrayBuffer[AdUserClickCount]()
      val selectSQL = "SELECT count(*) FROM ad_user_click_count WHERE date=? AND userid=? AND adid=? "
      for (adUserClickCount <- adUserClickCounts) {
        val selectParams: Array[Any] = Array(adUserClickCount.date, adUserClickCount.userid, adUserClickCount.adid)
        client.executeQuery(selectSQL, selectParams, new QueryCallback {
          override def process(rs: ResultSet): Unit = {
            // An existing row (count > 0) means update; otherwise insert.
            if (rs.next() && rs.getInt(1) > 0) {
              updateAdUserClickCounts += adUserClickCount
            } else {
              insertAdUserClickCounts += adUserClickCount
            }
          }
        })
      }
      // Batch-insert the brand-new keys.
      val insertSQL = "INSERT INTO ad_user_click_count VALUES(?,?,?,?)"
      val insertParamsList = insertAdUserClickCounts.map { c =>
        Array[Any](c.date, c.userid, c.adid, c.clickCount)
      }
      client.executeBatch(insertSQL, insertParamsList.toArray)
      // Batch-update existing keys; clickCount is ACCUMULATED, not replaced.
      val updateSQL = "UPDATE ad_user_click_count SET clickCount=clickCount + ? WHERE date=? AND userid=? AND adid=?"
      val updateParamsList = updateAdUserClickCounts.map { c =>
        Array[Any](c.clickCount, c.date, c.userid, c.adid)
      }
      client.executeBatch(updateSQL, updateParamsList.toArray)
    } finally {
      // Always return the connection to the pool, even on failure.
      mySqlPool.returnObject(client)
    }
  }

  /**
   * Look up the accumulated click count for one (date, userid, adid) key.
   *
   * @param date   click date (formatted string as produced by DateUtils)
   * @param userid user id
   * @param adid   ad id
   * @return the stored clickCount, or 0 if no row matches
   */
  def findClickCountByMultiKey(date: String, userid: Long, adid: Long): Int = {
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()
    val sql = "SELECT clickCount FROM ad_user_click_count " +
      "WHERE date=? " +
      "AND userid=? " +
      "AND adid=?"
    var clickCount = 0
    val params = Array[Any](date, userid, adid)
    try {
      client.executeQuery(sql, params, new QueryCallback {
        override def process(rs: ResultSet): Unit = {
          if (rs.next()) {
            clickCount = rs.getInt(1)
          }
        }
      })
    } finally {
      mySqlPool.returnObject(client)
    }
    clickCount
  }
}
/**
 * DAO for the real-time ad statistics table (ad_stat).
 *
 * @author Administrator
 */
object AdStatDAO {

  /**
   * Upsert cumulative click counts per (date, province, city, adid):
   * existing rows are overwritten with the new total, missing rows inserted.
   *
   * @param adStats cumulative statistics to persist
   */
  def updateBatch(adStats: Array[AdStat]) {
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()
    try {
      // Split the input into rows to insert and rows to update.
      val insertAdStats = ArrayBuffer[AdStat]()
      val updateAdStats = ArrayBuffer[AdStat]()
      val selectSQL = "SELECT count(*) " +
        "FROM ad_stat " +
        "WHERE date=? " +
        "AND province=? " +
        "AND city=? " +
        "AND adid=?"
      for (adStat <- adStats) {
        val params = Array[Any](adStat.date, adStat.province, adStat.city, adStat.adid)
        // An existing row means update; otherwise insert.
        client.executeQuery(selectSQL, params, new QueryCallback {
          override def process(rs: ResultSet): Unit = {
            if (rs.next() && rs.getInt(1) > 0) {
              updateAdStats += adStat
            } else {
              insertAdStats += adStat
            }
          }
        })
      }
      // Batch-insert new keys.
      val insertSQL = "INSERT INTO ad_stat VALUES(?,?,?,?,?)"
      val insertParamsList = insertAdStats.map { s =>
        Array[Any](s.date, s.province, s.city, s.adid, s.clickCount)
      }
      client.executeBatch(insertSQL, insertParamsList.toArray)
      // OVERWRITE (not accumulate): the caller supplies running totals,
      // so the stored value is simply replaced.
      val updateSQL = "UPDATE ad_stat SET clickCount=? " +
        "WHERE date=? " +
        "AND province=? " +
        "AND city=? " +
        "AND adid=?"
      val updateParamsList = updateAdStats.map { s =>
        Array[Any](s.clickCount, s.date, s.province, s.city, s.adid)
      }
      client.executeBatch(updateSQL, updateParamsList.toArray)
    } finally {
      // Always return the connection to the pool, even on failure.
      mySqlPool.returnObject(client)
    }
  }
}
/**
 * DAO for the per-province top-3 ads table (ad_province_top3).
 *
 * @author Administrator
 */
object AdProvinceTop3DAO {

  /**
   * Replace the stored top-3 rows for every (date, province) present in the
   * input: the old rows are deleted first, then all input rows are inserted.
   *
   * @param adProvinceTop3s the fresh top-3 rows to store
   */
  def updateBatch(adProvinceTop3s: Array[AdProvinceTop3]) {
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()
    try {
      // Distinct (date, province) pairs, keyed as "date_province".
      // (Replaces the original O(n^2) ArrayBuffer.contains dedup loop.)
      val dateProvinces = adProvinceTop3s.map(t => t.date + "_" + t.province).distinct
      // Delete the existing rows for each affected (date, province) first.
      val deleteSQL = "DELETE FROM ad_province_top3 WHERE date=? AND province=?"
      val deleteParamsList = dateProvinces.map { dateProvince =>
        val split = dateProvince.split("_")
        Array[Any](split(0), split(1))
      }
      client.executeBatch(deleteSQL, deleteParamsList.toArray)
      // Then insert every incoming row.
      val insertSQL = "INSERT INTO ad_province_top3 VALUES(?,?,?,?)"
      val insertParamsList = adProvinceTop3s.map { t =>
        Array[Any](t.date, t.province, t.adid, t.clickCount)
      }
      client.executeBatch(insertSQL, insertParamsList.toArray)
    } finally {
      // Always return the connection to the pool, even on failure.
      mySqlPool.returnObject(client)
    }
  }
}
/**
 * DAO for the minute-level ad click trend table (ad_click_trend).
 *
 * @author Administrator
 */
object AdClickTrendDAO {

  /**
   * Upsert minute-level click counts: existing (date, hour, minute, adid)
   * rows are overwritten with the new clickCount, missing rows are inserted.
   *
   * @param adClickTrends window-aggregated click counts to persist
   */
  def updateBatch(adClickTrends: Array[AdClickTrend]) {
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()
    try {
      // Split the input into rows to update and rows to insert.
      val updateAdClickTrends = ArrayBuffer[AdClickTrend]()
      val insertAdClickTrends = ArrayBuffer[AdClickTrend]()
      val selectSQL = "SELECT count(*) " +
        "FROM ad_click_trend " +
        "WHERE date=? " +
        "AND hour=? " +
        "AND minute=? " +
        "AND adid=?"
      for (adClickTrend <- adClickTrends) {
        val params = Array[Any](adClickTrend.date, adClickTrend.hour, adClickTrend.minute, adClickTrend.adid)
        // An existing row means update; otherwise insert.
        client.executeQuery(selectSQL, params, new QueryCallback {
          override def process(rs: ResultSet): Unit = {
            if (rs.next() && rs.getInt(1) > 0) {
              updateAdClickTrends += adClickTrend
            } else {
              insertAdClickTrends += adClickTrend
            }
          }
        })
      }
      // OVERWRITE (not accumulate): the window aggregation already produces
      // the complete count for each minute bucket.
      val updateSQL = "UPDATE ad_click_trend SET clickCount=? " +
        "WHERE date=? " +
        "AND hour=? " +
        "AND minute=? " +
        "AND adid=?"
      val updateParamsList = updateAdClickTrends.map { t =>
        Array[Any](t.clickCount, t.date, t.hour, t.minute, t.adid)
      }
      client.executeBatch(updateSQL, updateParamsList.toArray)
      // Batch-insert brand-new minute buckets.
      val insertSQL = "INSERT INTO ad_click_trend VALUES(?,?,?,?,?)"
      val insertParamsList = insertAdClickTrends.map { t =>
        Array[Any](t.date, t.hour, t.minute, t.adid, t.clickCount)
      }
      client.executeBatch(insertSQL, insertParamsList.toArray)
    } finally {
      // Always return the connection to the pool, even on failure.
      mySqlPool.returnObject(client)
    }
  }
}
一、广告实时黑名单添加
import java.util.Date
import commons.conf.ConfigurationManager
import commons.constant.Constants
import commons.utils.DateUtils
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Duration, Minutes, Seconds, StreamingContext}
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
object AdverStat {
/**
 * Entry point: builds a 5-second-batch StreamingContext on a local
 * SparkSession, consumes ad-click log lines from Kafka, filters out
 * blacklisted users, and runs the real-time requirements
 * (blacklist maintenance and province/city click statistics).
 */
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("adver").setMaster("local[*]")
val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
// val streamingContext = StreamingContext.getActiveOrCreate(checkpointDir, func)
val streamingContext = new StreamingContext(sparkSession.sparkContext, Seconds(5))
val kafka_brokers = ConfigurationManager.config.getString(Constants.KAFKA_BROKERS)
val kafka_topics = ConfigurationManager.config.getString(Constants.KAFKA_TOPICS)
val kafkaParam = Map(
"bootstrap.servers" -> kafka_brokers,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "group1",
// auto.offset.reset
// latest: use the stored offset if one exists; otherwise start from the newest data
// earliest: use the stored offset if one exists; otherwise start from the oldest data
// none: use the stored offset if one exists; otherwise raise an error
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false:java.lang.Boolean)
)
// adRealTimeDStream: DStream[RDD RDD RDD ...] RDD[message] message: key value
val adRealTimeDStream = KafkaUtils.createDirectStream[String, String](streamingContext,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Array(kafka_topics), kafkaParam)
)
// Extract the value of every Kafka record.
// adReadTimeValueDStream: Dstram[RDD RDD RDD ...] RDD[String]
// String: timestamp province city userid adid
val adReadTimeValueDStream = adRealTimeDStream.map(item => item.value())
// Drop clicks from blacklisted users.
// NOTE(review): the blacklist is re-read from MySQL once per batch on the driver.
val adRealTimeFilterDStream = adReadTimeValueDStream.transform{
logRDD =>
// blackListArray: Array[AdBlacklist] AdBlacklist: userId
val blackListArray = AdBlacklistDAO.findAll()
// userIdArray: Array[Long] [userId1, userId2, ...]
val userIdArray = blackListArray.map(item => item.userid)
logRDD.filter{
// log : timestamp province city userid adid
case log =>
val logSplit = log.split(" ")
val userId = logSplit(3).toLong
!userIdArray.contains(userId)
}
}
// Checkpointing is required by the stateful operators used below.
streamingContext.checkpoint("./spark-streaming")
adRealTimeFilterDStream.checkpoint(Duration(10000))
// Requirement 1: maintain the ad-click blacklist in real time
generateBlackList(adRealTimeFilterDStream)
// Requirement 2: cumulative daily ad click counts per province and city
provinceCityClickStat(adRealTimeFilterDStream)
streamingContext.start()
streamingContext.awaitTermination()
}
/**
 * Requirement 2: keep a running (cumulative) click total per
 * date/province/city/ad key and persist every update to ad_stat.
 *
 * @param adRealTimeFilterDStream blacklist-filtered log lines,
 *                                "timestamp province city userid adid"
 */
def provinceCityClickStat(adRealTimeFilterDStream: DStream[String]) = {
// adRealTimeFilterDStream: DStream[RDD[String]] String -> log : timestamp province city userid adid
// key2ProvinceCityDStream: DStream[RDD[key, 1L]]
val key2ProvinceCityDStream = adRealTimeFilterDStream.map{
case log =>
val logSplit = log.split(" ")
val timeStamp = logSplit(0).toLong
// dateKey : yy-mm-dd
val dateKey = DateUtils.formatDateKey(new Date(timeStamp))
val province = logSplit(1)
val city = logSplit(2)
val adid = logSplit(4)
val key = dateKey + "_" + province + "_" + city + "_" + adid
(key, 1L)
}
// Fold each batch's 1s into the running total kept in streaming state.
val key2StateDStream = key2ProvinceCityDStream.updateStateByKey[Long]{
(values:Seq[Long], state:Option[Long]) =>
var newValue = 0L
if(state.isDefined)
newValue = state.get
for(value <- values){
newValue += value
}
Some(newValue)
}
// Persist the updated totals; AdStatDAO overwrites existing rows.
key2StateDStream.foreachRDD{
rdd => rdd.foreachPartition{
items =>
val adStatArray = new ArrayBuffer[AdStat]()
// key: date province city adid
for((key, count) <- items){
val keySplit = key.split("_")
val date = keySplit(0)
val province = keySplit(1)
val city = keySplit(2)
val adid = keySplit(3).toLong
adStatArray += AdStat(date, province, city, adid, count)
}
AdStatDAO.updateBatch(adStatArray.toArray)
}
}
}
/**
 * Requirement 1: count clicks per (date, user, ad), persist the counts,
 * and add any user whose daily total for one ad exceeds 100 clicks to
 * the blacklist table.
 *
 * @param adRealTimeFilterDStream blacklist-filtered log lines,
 *                                "timestamp province city userid adid"
 */
def generateBlackList(adRealTimeFilterDStream: DStream[String]) = {
// adRealTimeFilterDStream: DStream[RDD[String]] String -> log : timestamp province city userid adid
// key2NumDStream: [RDD[(key, 1L)]]
val key2NumDStream = adRealTimeFilterDStream.map{
// log : timestamp province city userid adid
case log =>
val logSplit = log.split(" ")
val timeStamp = logSplit(0).toLong
// yy-mm-dd
val dateKey = DateUtils.formatDateKey(new Date(timeStamp))
val userId = logSplit(3).toLong
val adid = logSplit(4).toLong
val key = dateKey + "_" + userId + "_" + adid
(key, 1L)
}
val key2CountDStream = key2NumDStream.reduceByKey(_+_)
// Merge each batch's counts into the per-user click-count table.
key2CountDStream.foreachRDD{
rdd => rdd.foreachPartition{
items =>
val clickCountArray = new ArrayBuffer[AdUserClickCount]()
for((key, count) <- items){
val keySplit = key.split("_")
val date = keySplit(0)
val userId = keySplit(1).toLong
val adid = keySplit(2).toLong
clickCountArray += AdUserClickCount(date, userId, adid, count)
}
AdUserClickCountDAO.updateBatch(clickCountArray.toArray)
}
}
// Keep only keys whose accumulated click count crosses the blacklist threshold.
// key2BlackListDStream: DStream[RDD[(key, count)]]
val key2BlackListDStream = key2CountDStream.filter{
case (key, count) =>
val keySplit = key.split("_")
val date = keySplit(0)
val userId = keySplit(1).toLong
val adid = keySplit(2).toLong
// Re-read the accumulated total from MySQL; threshold is 100 clicks/day/ad.
val clickCount = AdUserClickCountDAO.findClickCountByMultiKey(date, userId, adid)
if(clickCount > 100){
true
}else{
false
}
}
// key2BlackListDStream.map: DStream[RDD[userId]]
val userIdDStream = key2BlackListDStream.map{
case (key, count) => key.split("_")(1).toLong
}.transform(rdd => rdd.distinct())
// Persist the offending user ids to the blacklist table.
userIdDStream.foreachRDD{
rdd => rdd.foreachPartition{
items =>
val userIdArray = new ArrayBuffer[AdBlacklist]()
for(userId <- items){
userIdArray += AdBlacklist(userId)
}
AdBlacklistDAO.insertBatch(userIdArray.toArray)
}
}
}
}
二、各省各城市一天中的广告点击量
/**
 * Requirement 2: cumulative daily ad click counts per province and city.
 *
 * Builds "date_province_city_adid" keys from the filtered click log, keeps a
 * running total per key with updateStateByKey, and writes every updated total
 * to MySQL (ad_stat) on each batch.
 *
 * @param adRealTimeFilterDStream blacklist-filtered log lines,
 *                                "timestamp province city userid adid"
 * @return the stateful DStream of (date_province_city_adid, cumulative count)
 */
def provinceCityClickStat(adRealTimeFilterDStream: DStream[String]) = {
  // One (date_province_city_adid, 1L) pair per click.
  val perClickPairs = adRealTimeFilterDStream.map { line =>
    val fields = line.split(" ")
    // dateKey : yy-mm-dd
    val day = DateUtils.formatDateKey(new Date(fields(0).toLong))
    (day + "_" + fields(1) + "_" + fields(2) + "_" + fields(4), 1L)
  }
  // Fold each batch's pairs into the running total held in streaming state.
  val runningTotals = perClickPairs.updateStateByKey[Long] {
    (batchCounts: Seq[Long], previous: Option[Long]) =>
      Some(previous.getOrElse(0L) + batchCounts.sum)
  }
  // Persist the updated totals; AdStatDAO overwrites existing rows.
  runningTotals.foreachRDD { rdd =>
    rdd.foreachPartition { entries =>
      val stats = entries.map { case (key, total) =>
        val parts = key.split("_")
        AdStat(parts(0), parts(1), parts(2), parts(3).toLong, total)
      }.toArray
      AdStatDAO.updateBatch(stats)
    }
  }
  runningTotals
}
main方法:
// 需求二:各省各城市一天中的广告点击量(累积统计)
val key2ProvinceCityCountDStream = provinceCityClickStat(adRealTimeFilterDStream)
三、统计各省Top3热门广告
/**
 * Requirement 3: top-3 ads per province per day.
 *
 * Collapses the per-city counts to per-province keys, aggregates, ranks with
 * a Spark SQL window function, and replaces the stored top-3 rows
 * (ad_province_top3) on each batch.
 *
 * @param sparkSession                 active session used to run the ranking SQL
 * @param key2ProvinceCityCountDStream (date_province_city_adid, count) pairs
 */
def proveinceTope3Adver(sparkSession: SparkSession,
key2ProvinceCityCountDStream: DStream[(String, Long)]) = {
  // Drop the city component: date_province_city_adid -> date_province_adid.
  val provinceKeyed = key2ProvinceCityCountDStream.map { case (key, cnt) =>
    val parts = key.split("_")
    (parts(0) + "_" + parts(1) + "_" + parts(3), cnt)
  }
  val provinceTotals = provinceKeyed.reduceByKey(_ + _)
  // Rank ads inside each (date, province) group and keep the top three.
  val top3PerProvince = provinceTotals.transform { rdd =>
    val rows = rdd.map { case (key, cnt) =>
      val parts = key.split("_")
      (parts(0), parts(1), parts(2).toLong, cnt)
    }
    import sparkSession.implicits._
    rows.toDF("date", "province", "adid", "count").createOrReplaceTempView("tmp_basic_info")
    val sql = "select date, province, adid, count from(" +
      "select date, province, adid, count, " +
      "row_number() over(partition by date,province order by count desc) rank from tmp_basic_info) t " +
      "where rank <= 3"
    sparkSession.sql(sql).rdd
  }
  // Persist: AdProvinceTop3DAO deletes the old rows for each (date, province)
  // before inserting the fresh top-3.
  top3PerProvince.foreachRDD { rdd =>
    rdd.foreachPartition { rows =>
      val buffer = new ArrayBuffer[AdProvinceTop3]()
      rows.foreach { row =>
        buffer += AdProvinceTop3(
          row.getAs[String]("date"),
          row.getAs[String]("province"),
          row.getAs[Long]("adid"),
          row.getAs[Long]("count"))
      }
      AdProvinceTop3DAO.updateBatch(buffer.toArray)
    }
  }
}
main函数:
// 需求三:统计各省Top3热门广告
proveinceTope3Adver(sparkSession, key2ProvinceCityCountDStream)
streamingContext.start()
streamingContext.awaitTermination()
四、最近一个小时的实时广告点击量统计
/**
 * Requirement 4: per-minute ad click counts over a sliding one-hour window.
 *
 * Keys each click by its minute bucket ("yyyyMMddHHmm") and ad id, reduces
 * over a 60-minute window sliding every minute, and persists the window
 * totals to MySQL (ad_click_trend).
 *
 * @param adRealTimeFilterDStream blacklist-filtered log lines,
 *                                "timestamp province city userId adid"
 */
def getRecentHourClickCount(adRealTimeFilterDStream: DStream[String]) = {
  // (timeMinute_adid, 1L) per click; timeMinute is yyyyMMddHHmm.
  val minuteAdOnes = adRealTimeFilterDStream.map { line =>
    val fields = line.split(" ")
    val minuteBucket = DateUtils.formatTimeMinute(new Date(fields(0).toLong))
    (minuteBucket + "_" + fields(4).toLong, 1L)
  }
  // 60-minute window, sliding every minute.
  val windowedCounts = minuteAdOnes.reduceByKeyAndWindow((a: Long, b: Long) => a + b, Minutes(60), Minutes(1))
  // Persist the window totals; AdClickTrendDAO overwrites existing buckets.
  windowedCounts.foreachRDD { rdd =>
    rdd.foreachPartition { entries =>
      val trends = entries.map { case (key, cnt) =>
        val parts = key.split("_")
        // parts(0) is yyyyMMddHHmm: slice out date, hour, minute.
        val stamp = parts(0)
        AdClickTrend(stamp.substring(0, 8), stamp.substring(8, 10), stamp.substring(10), parts(1).toLong, cnt)
      }.toArray
      AdClickTrendDAO.updateBatch(trends)
    }
  }
}
main函数:
// 需求四:最近一个小时广告点击量统计
getRecentHourClickCount(adRealTimeFilterDStream)