1. Data distribution by province and city
Requirement: count the number of log records for each province and city.
Analysis: implement it on Spark (the Parquet input is read through Spark SQL); store the result in MySQL and also write it out as a JSON file.
Add the dependency to the pom file:
<!-- gson | jackson | ... -->
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.4</version>
</dependency>
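For reference, Gson can serialize a Scala case class directly via field reflection; a minimal sketch (the Demo class and values are purely illustrative):

import com.google.gson.Gson

case class Demo(day: String, cnt: Int) // illustrative class
val json = new Gson().toJson(Demo("2021-04-04", 3)) // {"day":"2021-04-04","cnt":3}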
Straight to the code:
dolphin-doit01\src\main\scala\cn\sheep\dolphin\report\ProvinceCityAnalysisCore.scala
package cn.sheep.dolphin.report

import cn.sheep.dolphin.common.DolphinAppComm
import cn.sheep.dolphin.config.DolphinConfig
import com.google.gson.Gson
import org.apache.spark.sql.{SQLContext, SaveMode}

/**
  * Data distribution by province and city - Core
  * author: old sheep
  * Created 2021/04/04 12:22
  */
object ProvinceCityAnalysisCore {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      println(
        """
          |Usage: cn.sheep.dolphin.report.ProvinceCityAnalysisCore
          |Args: <parquetInputPath> <jsonOutputPath>
        """.stripMargin)
      sys.exit(-1)
    }

    // Read the arguments
    val Array(parquetInputPath, jsonOutputPath) = args

    // sparkContext
    val sc = DolphinAppComm.createSparkContext("Province/city data distribution")

    // parquet <- SQLContext
    val sqlc = new SQLContext(sc)
    val dataFrame = sqlc.read.parquet(parquetInputPath)

    // Aggregate: emit ((day, province, city), 1) and sum per key
    val result = dataFrame.map(row => {
      // day: first 10 characters of the request date (yyyy-MM-dd)
      val day = row.getAs[String]("requestdate").substring(0, 10)
      // province
      val province = row.getAs[String]("provincename")
      // city
      val city = row.getAs[String]("cityname")
      ((day, province, city), 1)
    }).reduceByKey(_ + _)

    // Store to MySQL
    import sqlc.implicits._
    val resultDF = result
      .map { case ((day, province, city), count) => (day, province, city, count) }
      .toDF("day", "province", "city", "cnt")
    resultDF.write.mode(SaveMode.Overwrite).jdbc(DolphinConfig._url, "r_procity_analysis", DolphinConfig.props)

    // Store as JSON
    // Option 1
    // resultDF.write.json(jsonOutputPath)
    // Option 2: case class => schema, serialized with Gson
    result.map { case ((day, province, city), count) =>
      // Gson is not serializable, so it is instantiated inside the closure
      // (a mapPartitions variant would amortize the cost per partition)
      val gson = new Gson()
      gson.toJson(RProcityAnalysis(day, province, city, count))
    }.saveAsTextFile(jsonOutputPath)

    sc.stop()
  }
}

/* Case class wrapping one result record */
case class RProcityAnalysis(day: String, province: String, city: String, cnt: Int)
Before launching, configure the program arguments:
C:\Users\mrman\dolphin-doit01\parquet
C:\Users\mrman\dolphin-doit01\provinceCityAnalysisCore
Run the ProvinceCityAnalysisCore program; the output can then be inspected on local disk.
Open one of the output part-00000 files to check the data.
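Each line of a part file is one Gson-serialized RProcityAnalysis record, e.g. (values illustrative):

{"day":"2021-04-04","province":"Beijing","city":"Beijing","cnt":207}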
Since the MySQL table is rewritten on each run (SaveMode.Overwrite), the result can also be checked in the spark database in MySQL.
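A quick way to spot-check the MySQL side from code is to read the table back through the same JDBC settings (a sketch reusing sqlc and DolphinConfig from the job above):

val check = sqlc.read.jdbc(DolphinConfig._url, "r_procity_analysis", DolphinConfig.props)
check.show()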
Summary:
(1) In production, runtime parameters are usually passed via args, which keeps the job flexible; genuinely fixed data (e.g. field definitions) can instead live in a configuration file.
(2) Whether you implement with SQL or Core, the idea is the same: identify the dimension fields of the metric, i.e. the group-by fields in SQL, which become the reduceByKey key in Core.
(3) A DataFrame can be obtained through SQLContext; there are several ways to build one (see the sketch after this list):
a. RDD[Row] + StructType(Seq(StructField...))
b. RDD + case class => toDF, i.e. RDD[A <: Product]
(4) fastjson does not handle Scala case classes particularly well; prefer Gson or Jackson when converting objects to JSON in Scala.
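To make point (3) concrete, a minimal sketch of both construction routes (the Person class and sample rows are purely illustrative; sc is an existing SparkContext as in the jobs above):

import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

case class Person(name: String, age: Int) // illustrative

val sqlc = new SQLContext(sc)

// (a) RDD[Row] + StructType(Seq(StructField...))
val rowRDD = sc.parallelize(Seq(Row("tom", 20), Row("amy", 18)))
val schema = StructType(Seq(
  StructField("name", StringType, nullable = false),
  StructField("age", IntegerType, nullable = false)
))
val df1 = sqlc.createDataFrame(rowRDD, schema)

// (b) RDD[A <: Product] (a case class) + toDF
import sqlc.implicits._
val df2 = sc.parallelize(Seq(Person("tom", 20), Person("amy", 18))).toDF()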
2. Regional metrics
Province/City | Raw requests | Effective requests | Ad requests | Bids entered | Bids won | Ad impressions | Ad clicks | Ad expense | Ad cost
2.1 Requirement: compute the metric columns listed above for each province and city.
2.2 Constraints:
- store the result in MySQL, with province and city kept as two separate fields;
- implement the requirement in two ways (SQL | Core);
- produce the result in a single job;
- aggregate per day.
Spark Core implementation
Straight to the code:
dolphin-doit01\src\main\scala\cn\sheep\dolphin\bean\AreaReportResult.scala
package cn.sheep.dolphin.bean

/**
  * author: old sheep
  * Created 2021/04/04 16:13
  */
case class AreaReportResult(
  day: String,            // statistics day (yyyy-MM-dd)
  provinceName: String,
  cityName: String,
  rawRequest: Int,        // raw requests
  effectiveRequest: Int,  // effective requests
  adRequest: Int,         // ad requests
  isRTB: Int,             // bids entered
  succRTB: Int,           // bids won
  adShow: Int,            // ad impressions
  adClick: Int,           // ad clicks
  expense: Double,        // ad expense
  cost: Double            // ad cost
)
dolphin-doit01\src\main\scala\cn\sheep\dolphin\report\AreaAnalysisCore.scala
package cn.sheep.dolphin.report

import cn.sheep.dolphin.bean.AreaReportResult
import cn.sheep.dolphin.common.DolphinAppComm
import cn.sheep.dolphin.config.DolphinConfig
import org.apache.spark.sql.{Row, SQLContext, SaveMode}

/**
  * Regional metrics - core
  * author: old sheep
  * Created 2021/04/04 14:25
  */
object AreaAnalysisCore {

  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      println(
        """
          |Usage: cn.sheep.dolphin.report.AreaAnalysisCore
          | <parquetInputPath>
        """.stripMargin)
      sys.exit(-1)
    }

    val Array(parquetInputPath) = args
    val sc = DolphinAppComm.createSparkContext("Regional metrics")
    val sQLContext = new SQLContext(sc)

    // Read the parquet data
    val dataFrame = sQLContext.read.parquet(parquetInputPath)

    import sQLContext.implicits._
    dataFrame.map(row => {
      // Day, province and city
      val day = row.getAs[String]("requestdate").substring(0, 10) // request time, format: yyyy-MM-dd HH:mm:ss
      val proName = row.getAs[String]("provincename")             // province of the device
      val cityName = row.getAs[String]("cityname")                // city of the device

      // Fields the metric flags are derived from
      val requestMode = row.getAs[Int]("requestmode") // request type (1: request, 2: impression, 3: click)
      val processNode = row.getAs[Int]("processnode") // process node (1: raw request KPI, 2: effective request, 3: ad request)
      val effective   = row.getAs[Int]("iseffective") // effective flag, i.e. billable (0: no, 1: yes)
      val billing     = row.getAs[Int]("isbilling")   // billed flag (0: not billed, 1: billed)
      val isBid       = row.getAs[Int]("isbid")       // entered the RTB auction?
      val isWin       = row.getAs[Int]("iswin")       // won the auction?
      val adOrderId   = row.getAs[Int]("adorderid")   // ad id

      // Raw request, effective request, ad request
      val (rawRequest, effectiveRequest, adRequest) =
        if (requestMode == 1 && processNode >= 3) (1, 1, 1)
        else if (requestMode == 1 && processNode == 2) (1, 1, 0)
        else if (requestMode == 1 && processNode == 1) (1, 0, 0)
        else (0, 0, 0)

      // Did this log entry enter the auction?
      val isRTB = if (effective == 1 && billing == 1 && isBid == 1 && adOrderId != 0) 1 else 0

      // Did this log entry win the auction?
      val (succRTB, expense, cost) = if (effective == 1 && billing == 1 && isWin == 1) {
        val winPrice  = row.getAs[Double]("winprice") / 1000d  // winning bid price
        val adPayment = row.getAs[Double]("adpayment") / 1000d // converted ad spend
        (1, winPrice, adPayment)
      } else (0, 0d, 0d)

      // Ad impression, ad click
      val (adShow, adClick) =
        if (requestMode == 2 && effective == 1) (1, 0)
        else if (requestMode == 3 && effective == 1) (0, 1)
        else (0, 0)

      // Assemble (k, v) => ((day, proName, cityName), (raw, effective, ad requests, bids entered, bids won, impressions, clicks, expense, cost))
      (
        (day, proName, cityName),
        (rawRequest, effectiveRequest, adRequest, isRTB, succRTB, adShow, adClick, expense, cost)
      )
    }).reduceByKey((a, b) => {
      (
        a._1 + b._1,
        a._2 + b._2,
        a._3 + b._3,
        a._4 + b._4,
        a._5 + b._5,
        a._6 + b._6,
        a._7 + b._7,
        a._8 + b._8,
        a._9 + b._9
      )
    }).map {
      case ((day, pname, cname), (rawReq, effReq, adReq, isRTB, succRTB, adShow, adClick, expense, cost)) =>
        AreaReportResult(
          day, pname, cname,
          rawReq, effReq, adReq,
          isRTB, succRTB, adShow,
          adClick, expense, cost)
    }
      .toDF()
      .write.mode(SaveMode.Append)
      .jdbc(DolphinConfig._url, "r_area_analysis", DolphinConfig.props)

    sc.stop()
  }
}
Run the AreaAnalysisCore program and watch the console output.
In MySQL, the spark database now contains a new table, r_area_analysis.
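One caveat: this job writes with SaveMode.Append, so re-running it inserts the same aggregates into r_area_analysis again. If a re-run should replace the previous result, switch to Overwrite as the other jobs here do:

.write.mode(SaveMode.Overwrite)
.jdbc(DolphinConfig._url, "r_area_analysis", DolphinConfig.props)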
Optimizing the code: the nine-element tuple addition in reduceByKey is verbose and error-prone; replacing the tuple with a List of Doubles lets reduceByKey sum the metrics element-wise with zip:
package cn.sheep.dolphin.report

import cn.sheep.dolphin.bean.AreaReportResult
import cn.sheep.dolphin.common.DolphinAppComm
import cn.sheep.dolphin.config.DolphinConfig
import org.apache.spark.sql.{SQLContext, SaveMode}

/**
  * Regional metrics - core [after refactoring]
  * author: old sheep
  * Created 2021/04/04 19:23
  */
object AreaAnalysisCore {

  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      println(
        """
          |Usage: cn.sheep.dolphin.report.AreaAnalysisCore
          | <parquetInputPath>
        """.stripMargin)
      sys.exit(-1)
    }

    val Array(parquetInputPath) = args
    val sc = DolphinAppComm.createSparkContext("Regional metrics")
    val sQLContext = new SQLContext(sc)

    // Read the parquet data
    val dataFrame = sQLContext.read.parquet(parquetInputPath)

    import sQLContext.implicits._
    dataFrame.map(row => {
      // Day, province and city
      val day = row.getAs[String]("requestdate").substring(0, 10) // request time, format: yyyy-MM-dd HH:mm:ss
      val proName = row.getAs[String]("provincename")             // province of the device
      val cityName = row.getAs[String]("cityname")                // city of the device

      // Fields the metric flags are derived from
      val requestMode = row.getAs[Int]("requestmode") // request type (1: request, 2: impression, 3: click)
      val processNode = row.getAs[Int]("processnode") // process node (1: raw request KPI, 2: effective request, 3: ad request)
      val effective   = row.getAs[Int]("iseffective") // effective flag, i.e. billable (0: no, 1: yes)
      val billing     = row.getAs[Int]("isbilling")   // billed flag (0: not billed, 1: billed)
      val isBid       = row.getAs[Int]("isbid")       // entered the RTB auction?
      val isWin       = row.getAs[Int]("iswin")       // won the auction?
      val adOrderId   = row.getAs[Int]("adorderid")   // ad id

      // Raw request, effective request, ad request
      val (rawRequest, effectiveRequest, adRequest) =
        if (requestMode == 1 && processNode >= 3) (1, 1, 1)
        else if (requestMode == 1 && processNode == 2) (1, 1, 0)
        else if (requestMode == 1 && processNode == 1) (1, 0, 0)
        else (0, 0, 0)

      // Did this log entry enter the auction?
      val isRTB = if (effective == 1 && billing == 1 && isBid == 1 && adOrderId != 0) 1 else 0

      // Did this log entry win the auction?
      val (succRTB, expense, cost) = if (effective == 1 && billing == 1 && isWin == 1) {
        val winPrice  = row.getAs[Double]("winprice") / 1000d  // winning bid price
        val adPayment = row.getAs[Double]("adpayment") / 1000d // converted ad spend
        (1, winPrice, adPayment)
      } else (0, 0d, 0d)

      // Ad impression, ad click
      val (adShow, adClick) =
        if (requestMode == 2 && effective == 1) (1, 0)
        else if (requestMode == 3 && effective == 1) (0, 1)
        else (0, 0)

      // Assemble (k, v); a List[Double] instead of a tuple, so the reduce can zip
      (
        (day, proName, cityName),
        List[Double](rawRequest, effectiveRequest, adRequest, isRTB, succRTB, adShow, adClick, expense, cost)
      )
    }).reduceByKey((a, b) => {
      // Element-wise sum of the two metric lists
      a.zip(b).map(tp => tp._1 + tp._2)
    }).map {
      case ((day, pname, cname), List(rawReq, effReq, adReq, isRTB, succRTB, adShow, adClick, expense, cost)) =>
        AreaReportResult(
          day, pname, cname,
          rawReq.toInt, effReq.toInt, adReq.toInt,
          isRTB.toInt, succRTB.toInt, adShow.toInt,
          adClick.toInt, expense, cost)
    }
      .toDF()
      .write.mode(SaveMode.Append)
      .jdbc(DolphinConfig._url, "r_area_analysis", DolphinConfig.props)

    sc.stop()
  }
}
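Note the explicit List[Double]: mixing Int and Double elements would make Scala infer List[AnyVal], which has no + operator, and the zip-based sum above would not compile. In isolation the trick looks like this:

val a = List[Double](1, 1, 0, 1, 0, 0, 0, 0.5, 0.3)
val b = List[Double](1, 0, 0, 0, 0, 1, 0, 0.0, 0.0)
a.zip(b).map(tp => tp._1 + tp._2) // => List(2.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.5, 0.3)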
Spark SQL implementation
package cn.sheep.dolphin.report

import cn.sheep.dolphin.common.DolphinAppComm
import cn.sheep.dolphin.config.DolphinConfig
import org.apache.spark.sql.{SQLContext, SaveMode}

/**
  * Regional metrics - sql
  * author: old sheep
  * Created 2021/04/04 21:30
  */
object AreaAnalysisSQL {

  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      println(
        """
          |Usage: cn.sheep.dolphin.report.AreaAnalysisSQL
          | <parquetInputPath>
        """.stripMargin)
      sys.exit(-1)
    }

    val Array(parquetInputPath) = args
    val sc = DolphinAppComm.createSparkContext("Regional metrics")
    val sQLContext = new SQLContext(sc)

    // Read the parquet data
    val dataFrame = sQLContext.read.parquet(parquetInputPath)

    // Register the DataFrame as a temporary table
    dataFrame.registerTempTable("adlogs")

    // Spark SQL supports user-defined functions; sheepif mirrors the built-in if
    sQLContext.udf.register("sheepif", (bool: Boolean, right: Double, wrong: Double) => if (bool) right else wrong)

    // Query the table
    val result = sQLContext.sql(
      """
        |select provincename, cityname, substring(requestdate, 0, 10) day,
        |sum(case when requestmode=1 and processnode>=1 then 1 else 0 end) rawReq,
        |sum(case when requestmode=1 and processnode>=2 then 1 else 0 end) effReq,
        |sum(case when requestmode=1 and processnode=3 then 1 else 0 end) adReq,
        |
        |sum(sheepif(iseffective=1 and isbilling=1 and isbid=1 and adorderid!=0, 1, 0)) isRTB,
        |sum(if(iseffective=1 and isbilling=1 and iswin=1, 1, 0)) succRTB,
        |
        |sum(if(requestmode=2 and iseffective=1, 1, 0)) adShow,
        |sum(if(requestmode=3 and iseffective=1, 1, 0)) adClick,
        |
        |sum(if(iseffective=1 and isbilling=1 and iswin=1, winprice/1000, 0)) expense,
        |sum(if(iseffective=1 and isbilling=1 and iswin=1, adpayment/1000, 0)) cost
        |
        |from adlogs
        |group by provincename, cityname, substring(requestdate, 0, 10)
      """.stripMargin)

    // Write the result to the database
    result.write.mode(SaveMode.Overwrite).jdbc(DolphinConfig._url, "r_area_analysis2", DolphinConfig.props)

    sc.stop()
  }
}
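For comparison, the same aggregation can also be written against the DataFrame API instead of SQL text; a partial sketch (assumes Spark 1.5+, where functions.substring and when are available; dataFrame is the parquet input from above):

import org.apache.spark.sql.functions._

val agg = dataFrame
  .withColumn("day", substring(col("requestdate"), 0, 10))
  .groupBy(col("provincename"), col("cityname"), col("day"))
  .agg(
    sum(when(col("requestmode") === 1 && col("processnode") >= 1, 1).otherwise(0)).as("rawReq"),
    sum(when(col("requestmode") === 2 && col("iseffective") === 1, 1).otherwise(0)).as("adShow")
    // ...the remaining metrics follow the same when/otherwise pattern
  )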
Run the AreaAnalysisSQL program and watch the console output.
In MySQL, the spark database now contains a new table, r_area_analysis2.
To inspect the data on the console instead, just call show():
// Write the result to the database
// result.write.mode(SaveMode.Overwrite).jdbc(DolphinConfig._url, "r_area_analysis2", DolphinConfig.props)
result.show()
The result is printed to the console.
The regional-metrics job can also be inspected on the Spark Web UI page.