做什么?
根据需求七中统计的各省各城市累计广告点击量,创建SparkSQL临时表,通过SQL查询的形式获取各省的Top3热门广告。
需求分析
- 在需求七中,我们已经实时统计了各省各城市广告的点击量,并且key的格式是(date_province_city_adid),
- 现在我们只需要把Key变为(date_province_adid),即去掉city字段,再用reduceByKey进行累加
- 最后利用sparkSQL创建临时表,进行排序即可
步骤分析:
1.转化key的格式
//1. Re-key from date_province_city_adid to date_province_adid; the value stays the original count
val key2ProvinceCountDStream=key2ProvinceCityCountDStream.map{
case (key,count)=>{
// incoming key format: date_province_city_adid — index 2 (the city) is deliberately dropped
val keySplit = key.split("_")
val date = keySplit(0)
val province = keySplit(1)
val adid = keySplit(3)
(date+"_"+province+"_"+adid,count);
}
}
2.累加,创建临时表
//2. Sum the counts per (date, province, adid), then rank with a Spark SQL temp view
val key2ProvinceAggCountDStream = key2ProvinceCountDStream.reduceByKey(_ + _)
val top3DStream = key2ProvinceAggCountDStream.transform { rdd =>
  // Split the composite key back into typed columns for the DataFrame.
  val rowRDD = rdd.map {
    case (key, count) =>
      val keySplit = key.split("_")   // key format: date_province_adid
      val date = keySplit(0)
      val province = keySplit(1)
      val adid = keySplit(2).toLong
      (date, province, adid, count)
  }
  import sparkSession.implicits._
  rowRDD.toDF("date", "province", "adid", "count").createOrReplaceTempView("tmp_basic_info")
  // row_number() partitioned by (date, province) keeps the 3 highest counts per
  // province. The subquery is aliased ("t"): Hive and older Spark SQL versions
  // reject an unaliased FROM-clause subquery with a ParseException.
  val sql = "select date, province, adid, count from(" +
    "select date, province, adid, count, " +
    "row_number() over(partition by date,province order by count desc) rank from tmp_basic_info) t " +
    "where rank <= 3"
  sparkSession.sql(sql).rdd
}
3.封装,写入数据库
//3. Wrap each result Row into an AdProvinceTop3 bean; the DB write is currently
//   disabled, so results are only printed per partition.
top3DStream.foreachRDD{
// rdd : RDD[Row]
rdd =>
rdd.foreachPartition{
// items : Iterator[Row]
items =>
val top3Array = new ArrayBuffer[AdProvinceTop3]()
for(item <- items){
val date = item.getAs[String]("date")
val province = item.getAs[String]("province")
val adid = item.getAs[Long]("adid")
val count = item.getAs[Long]("count")
top3Array += AdProvinceTop3(date, province, adid, count)
}
top3Array.foreach(println);
//AdProvinceTop3DAO.updateBatch(top3Array.toArray)
}
}
完整代码
/**
 * Requirement 8: per-province Top-3 hottest ads.
 *
 * Takes the per-(date, province, city, adid) accumulated click counts produced
 * by requirement 7, rolls them up to (date, province, adid), and uses a Spark
 * SQL window function to keep the 3 most-clicked ads for each date/province.
 * Results are printed per partition; the DAO batch write is currently disabled.
 *
 * @param sparkSession active SparkSession used for the temp view and SQL query
 * @param key2ProvinceCityCountDStream stream keyed by "date_province_city_adid"
 *        with accumulated click counts as values
 */
def proveinceTope3Adver(sparkSession: SparkSession,
                        key2ProvinceCityCountDStream: DStream[(String, Long)]) = {
  // 1. Re-key to date_province_adid (drop the city segment); keep the count.
  val key2ProvinceCountDStream = key2ProvinceCityCountDStream.map {
    case (key, count) =>
      val keySplit = key.split("_")   // date_province_city_adid
      val date = keySplit(0)
      val province = keySplit(1)
      val adid = keySplit(3)
      (date + "_" + province + "_" + adid, count)
  }

  // 2. Sum counts per (date, province, adid), then rank via a Spark SQL temp view.
  val key2ProvinceAggCountDStream = key2ProvinceCountDStream.reduceByKey(_ + _)
  val top3DStream = key2ProvinceAggCountDStream.transform { rdd =>
    // Split the composite key back into typed columns for the DataFrame.
    val rowRDD = rdd.map {
      case (key, count) =>
        val keySplit = key.split("_")   // date_province_adid
        val date = keySplit(0)
        val province = keySplit(1)
        val adid = keySplit(2).toLong
        (date, province, adid, count)
    }
    import sparkSession.implicits._
    rowRDD.toDF("date", "province", "adid", "count").createOrReplaceTempView("tmp_basic_info")
    // row_number() partitioned by (date, province) keeps the 3 highest counts
    // per province. The subquery is aliased ("t"): Hive and older Spark SQL
    // versions reject an unaliased FROM-clause subquery with a ParseException.
    val sql = "select date, province, adid, count from(" +
      "select date, province, adid, count, " +
      "row_number() over(partition by date,province order by count desc) rank from tmp_basic_info) t " +
      "where rank <= 3"
    sparkSession.sql(sql).rdd
  }

  // 3. Wrap each Row into an AdProvinceTop3 bean; the DB write is commented
  //    out, so for now the beans are only printed.
  top3DStream.foreachRDD { rdd =>
    // rdd : RDD[Row]
    rdd.foreachPartition { items =>
      // items : Iterator[Row] for one partition
      val top3Array = new ArrayBuffer[AdProvinceTop3]()
      for (item <- items) {
        val date = item.getAs[String]("date")
        val province = item.getAs[String]("province")
        val adid = item.getAs[Long]("adid")
        val count = item.getAs[Long]("count")
        top3Array += AdProvinceTop3(date, province, adid, count)
      }
      top3Array.foreach(println)
      //AdProvinceTop3DAO.updateBatch(top3Array.toArray)
    }
  }
}