package com.weblog.cn
import java.sql.{Connection, PreparedStatement}

import scala.collection.mutable.ListBuffer
/**
 * DAO for persisting the per-dimension Top-N statistics to MySQL.
 *
 * Every insert method writes its whole list as a single manually-committed
 * JDBC batch, so one partition of results is committed atomically.
 */
object StatDAO {

  /**
   * Shared batch-insert skeleton: obtains a connection, binds every element
   * of `list` through `bind`, executes the batch and commits it.
   *
   * On any failure the open transaction is rolled back before the connection
   * is released, and the exception is logged (best-effort semantics, matching
   * the original callers' expectations).
   *
   * @param sql  parameterized INSERT statement
   * @param list rows collected for one partition
   * @param bind binds one element's fields onto the prepared statement
   */
  private def batchInsert[T](sql: String, list: ListBuffer[T])(bind: (PreparedStatement, T) => Unit): Unit = {
    var connection: Connection = null
    var pstmt: PreparedStatement = null
    try {
      connection = MySQLUtils.getConnection()
      connection.setAutoCommit(false) // manual commit: one transaction per batch
      pstmt = connection.prepareStatement(sql)
      for (ele <- list) {
        bind(pstmt, ele)
        pstmt.addBatch()
      }
      pstmt.executeBatch() // one round trip for the whole batch
      connection.commit()  // manual commit
    } catch {
      case e: Exception =>
        // BUGFIX: roll back the half-executed batch; the original code
        // released the connection with uncommitted state still pending.
        if (connection != null) {
          try connection.rollback() catch { case _: Exception => () }
        }
        e.printStackTrace()
    } finally {
      MySQLUtils.release(connection, pstmt)
    }
  }

  /** Batch-save `DayVideoAccessStat` rows (most-viewed videos per day). */
  def insertDayVideoAccessTopN(list: ListBuffer[DayVideoAccessStat]) = {
    batchInsert("insert into day_video_access_topn_stat(day,cms_id,times) values (?,?,?) ", list) { (pstmt, ele) =>
      pstmt.setString(1, ele.day)
      pstmt.setLong(2, ele.cmsId)
      pstmt.setLong(3, ele.times)
    }
  }

  /** Batch-save `DayCityVideoAccessStat` rows (per-city Top-N videos). */
  def insertDayCityVideoAccessTopN(list: ListBuffer[DayCityVideoAccessStat]) = {
    batchInsert("insert into day_video_city_access_topn_stat(day,cms_id,city,times,times_rank) values (?,?,?,?,?) ", list) { (pstmt, ele) =>
      pstmt.setString(1, ele.day)
      pstmt.setLong(2, ele.cmsId)
      pstmt.setString(3, ele.city)
      pstmt.setLong(4, ele.times)
      pstmt.setInt(5, ele.timesRank)
    }
  }

  /** Batch-save `DayVideoTrafficsStat` rows (per-video traffic per day). */
  def insertDayVideoTrafficsAccessTopN(list: ListBuffer[DayVideoTrafficsStat]) = {
    batchInsert("insert into day_video_traffics_topn_stat(day,cms_id,traffics) values (?,?,?) ", list) { (pstmt, ele) =>
      pstmt.setString(1, ele.day)
      pstmt.setLong(2, ele.cmsId)
      pstmt.setLong(3, ele.traffics)
    }
  }
}
// === 2.3 TopNStatJob.scala === (tutorial section marker: the content below is a separate source file)
package com.weblog.cn
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.mutable.ListBuffer
/**
 * Spark job computing the Top-N statistics (popularity, per-city ranking,
 * traffic) for one day of cleaned access logs, and persisting each result
 * to MySQL via [[StatDAO]].
 */
object TopNStatJob {

  /** Day analysed when the caller does not pass one explicitly. */
  private val DefaultDay = "20170511"

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("TopNStatJobApp")
      // keep partition columns (e.g. "day") as strings instead of inferring ints
      .config("spark.sql.sources.partitionColumnTypeInference.enabled", "false")
      .master("local[2]")
      .getOrCreate()

    val accessDF = spark.read.format("parquet").load("d://weblog_clean")
    // accessDF.printSchema()
    // accessDF.show(false)

    // Most popular Top-N courses
    //videoAccessTopNStat(spark, accessDF)
    // Top-N courses per city
    //cityAccessTopNStat(spark, accessDF)
    videoTrafficTopNStat(spark, accessDF)

    spark.stop()
  }

  /**
   * Top-N videos by total traffic for `day`, persisted to MySQL.
   *
   * @param day partition day to analyse, `"yyyyMMdd"` (defaults to [[DefaultDay]])
   */
  def videoTrafficTopNStat(spark: SparkSession, accessDF: DataFrame, day: String = DefaultDay): Unit = {
    import spark.implicits._

    val trafficTopNDF = accessDF
      .filter($"day" === day && $"cmsType" === "video")
      .groupBy("day", "cmsId")
      .agg(sum("traffic").as("traffics"))
      .orderBy($"traffics".desc)
    //trafficTopNDF.show(false)

    // Write the result to MySQL, one JDBC batch per partition.
    try {
      trafficTopNDF.foreachPartition(partitionOfRecords => {
        val list = new ListBuffer[DayVideoTrafficsStat]
        partitionOfRecords.foreach { info =>
          list.append(DayVideoTrafficsStat(
            info.getAs[String]("day"),
            info.getAs[Long]("cmsId"),
            info.getAs[Long]("traffics")))
        }
        StatDAO.insertDayVideoTrafficsAccessTopN(list)
      })
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }

  /**
   * Top-3 most accessed videos per city for `day`, persisted to MySQL.
   *
   * @param day partition day to analyse, `"yyyyMMdd"` (defaults to [[DefaultDay]])
   */
  def cityAccessTopNStat(spark: SparkSession, accessDF: DataFrame, day: String = DefaultDay) = {
    import spark.implicits._

    val cityAccessTopNDF = accessDF
      .filter($"day" === day && $"cmsType" === "video")
      .groupBy("day", "cmsId", "city")
      .agg(count("cmsId").as("times"))
    //cityAccessTopNDF.show(false)

    // Rank videos inside each city with a window function and keep the Top 3.
    val top3DF = cityAccessTopNDF.select(
      cityAccessTopNDF("day"),
      cityAccessTopNDF("city"),
      cityAccessTopNDF("cmsId"),
      cityAccessTopNDF("times"),
      row_number().over(
        Window.partitionBy(cityAccessTopNDF("city"))
          // BUGFIX: rank by access count, not by city name — the original
          // ordered by "city", which made times_rank meaningless.
          .orderBy(cityAccessTopNDF("times").desc)
      ).as("times_rank")
    ).filter("times_rank <= 3")
    //top3DF.show(false)

    // Write the result to MySQL, one JDBC batch per partition.
    try {
      top3DF.foreachPartition(partitionOfRecords => {
        val list = new ListBuffer[DayCityVideoAccessStat]
        partitionOfRecords.foreach { info =>
          list.append(DayCityVideoAccessStat(
            info.getAs[String]("day"),
            info.getAs[Long]("cmsId"),
            info.getAs[String]("city"),
            info.getAs[Long]("times"),
            info.getAs[Int]("times_rank")))
        }
        StatDAO.insertDayCityVideoAccessTopN(list)
      })
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }

  /**
   * Most popular Top-N videos for `day`, persisted to MySQL.
   *
   * @param day partition day to analyse, `"yyyyMMdd"` (defaults to [[DefaultDay]])
   */
  def videoAccessTopNStat(spark: SparkSession, accessDF: DataFrame, day: String = DefaultDay) = {
    // DataFrame flavour, kept for reference:
    /*
    import spark.implicits._
    val videoAccessTopNDF = accessDF.filter($"day" === day && $"cmsType" === "video")
      .groupBy("day", "cmsId").agg(count("cmsId").as("times")).orderBy($"times".desc)
    videoAccessTopNDF.show(false)
    */

    // SQL flavour. `day` is an internal "yyyyMMdd" partition value, not
    // untrusted user input, so plain interpolation is acceptable here.
    accessDF.createOrReplaceTempView("access_logs")
    val videoAccessTopNDF = spark.sql(
      s"select day,cmsId, count(1) as times from access_logs where day='$day' and cmsType='video'" +
        " group by day,cmsId order by times desc")
    //videoAccessTopNDF.show(false)

    // Write the result to MySQL, one JDBC batch per partition.
    try {
      videoAccessTopNDF.foreachPartition(partitionOfRecords => {
        val list = new ListBuffer[DayVideoAccessStat]
        partitionOfRecords.foreach { info =>
          list.append(DayVideoAccessStat(
            info.getAs[String]("day"),
            info.getAs[Long]("cmsId"),
            info.getAs[Long]("times")))
        }
        StatDAO.insertDayVideoAccessTopN(list)
      })
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }
}