在项目中使用 Redis 缓存字典文件(作用等同于广播变量):
package com.app
import com.utils.{JedisConnectionPool, RptUtils}
import org.apache.commons.lang.StringUtils
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Per-app report job that keeps the app dictionary in Redis instead of a
 * broadcast variable.
 *
 * Steps:
 *  1. Load the tab-separated dictionary file and store column 4 -> column 1
 *     (app id -> app name, e.g. "com.123.cn" -> "爱奇艺") in Redis.
 *  2. Read the parquet log, resolving a blank `appname` through Redis by `appid`.
 *  3. Sum the per-row metric lists per app name and print the first 10 lines.
 */
object AppRpt2 {

  /**
   * Wraps an iterator so that `release` runs exactly once, as soon as the
   * underlying iterator reports it is exhausted.
   *
   * Needed because `mapPartitions` is lazy: a resource used inside the mapped
   * iterator must stay open until the last element has been produced, so it
   * cannot simply be closed before the iterator is returned.
   *
   * @param underlying the lazily evaluated iterator to delegate to
   * @param release    clean-up action, evaluated at most once
   * @return an iterator yielding the same elements as `underlying`
   */
  private def closingIterator[A](underlying: Iterator[A])(release: => Unit): Iterator[A] =
    new Iterator[A] {
      private var released = false

      override def hasNext: Boolean = {
        val more = underlying.hasNext
        if (!more && !released) {
          released = true
          release
        }
        more
      }

      override def next(): A = underlying.next()
    }

  /**
   * Entry point.
   *
   * @param args exactly three values: parquet input path, output path
   *             (currently unused) and dictionary file path
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println("目录不存在,请重新输入")
      // A usage error must not look like a successful run to the caller.
      sys.exit(1)
    }
    // The second argument (output path) is accepted but not used by this job.
    val Array(inputPath, _, resultPath) = args

    val conf = new SparkConf()
      .setAppName(s"${this.getClass.getName}")
      .setMaster("local[*]")
      // Use Kryo for Spark's internal serialization.
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    try {
      val sQLContext = new SQLContext(sc)
      // Parquet compression codec.
      sQLContext.setConf("spark.sql.parquet.compression.codec", "snappy")

      // Same fallback name the broadcast-based variant of this job uses.
      val unknownAppName = "unknow"

      // Dictionary file: keep rows with at least 5 columns, emit (app id, app name).
      val dicMap = sc.textFile(resultPath)
        .map(_.split("\t", -1))
        .filter(_.length >= 5)
        .map(arr => (arr(4), arr(1)))

      // Store the dictionary in Redis, one connection per partition.
      // NOTE(review): assumes getConnection() returns a pooled Jedis whose
      // close() hands the connection back to the pool — confirm.
      dicMap.foreachPartition { part =>
        val jedis = JedisConnectionPool.getConnection()
        try part.foreach { case (appId, appName) => jedis.set(appId, appName) }
        finally jedis.close()
      }

      // Read the parquet log. Going through `.rdd` keeps the pair-RDD operations
      // below available regardless of whether `df` is a 1.x DataFrame or a 2.x Dataset.
      val df = sQLContext.read.parquet(inputPath)
      val report = df.rdd.mapPartitions { rows =>
        val jedis = JedisConnectionPool.getConnection()
        val metrics = rows.map { row =>
          val rawName = row.getAs[String]("appname")
          // Blank name: look it up in Redis by app id; a missing id or a Redis
          // miss falls back to the placeholder instead of a null key.
          val appname =
            if (StringUtils.isNotBlank(rawName)) rawName
            else
              Option(row.getAs[String]("appid"))
                .flatMap(id => Option(jedis.get(id)))
                .getOrElse(unknownAppName)

          // Raw / effective / ad request inputs.
          val requestmode = row.getAs[Int]("requestmode")
          val processnode = row.getAs[Int]("processnode")
          val iseffective = row.getAs[Int]("iseffective")
          // Bidding, win, impression and click inputs.
          val isbilling = row.getAs[Int]("isbilling")
          val isbid = row.getAs[Int]("isbid")
          val iswin = row.getAs[Int]("iswin")
          val adorderid = row.getAs[Int]("adorderid")
          // Ad spend and ad cost inputs.
          val winPrice = row.getAs[Double]("winprice")
          val adpayment = row.getAs[Double]("adpayment")

          // Business rules live in RptUtils; each call returns a list of metrics.
          val reqlist = RptUtils.calculateReq(requestmode, processnode)
          val rtblist = RptUtils.calculateRtb(iseffective, isbilling, isbid, iswin, adorderid, winPrice, adpayment)
          val cliklist = RptUtils.calculateTimes(requestmode, iseffective)
          (appname, reqlist ++ rtblist ++ cliklist)
        }
        // Release the connection only after the lazy iterator has been drained.
        closingIterator(metrics)(jedis.close())
      }

      report
        // Element-wise sum: (0,2,1,5) + (2,5,4,7) => (2,7,5,12)
        .reduceByKey((list1, list2) => list1.zip(list2).map(t => t._1 + t._2))
        .map(t => t._1 + "," + t._2.mkString(","))
        .take(10)
        .foreach(println)
    } finally {
      sc.stop()
    }
  }
}
附:使用广播变量(broadcast)广播小字典文件的实现:
package com.app
import com.utils.RptUtils
import org.apache.commons.lang.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
/**
 * Per-app report job that ships the small app dictionary to executors with a
 * broadcast variable.
 *
 * Steps:
 *  1. Load the tab-separated dictionary file, collect column 4 -> column 1
 *     (app id -> app name, e.g. "com.123.cn" -> "爱奇艺") on the driver and
 *     broadcast it.
 *  2. Read the parquet log, resolving a blank `appname` through the broadcast
 *     map by `appid` (falling back to "unknow").
 *  3. Sum the per-row metric lists per app name and print up to 1000 lines.
 */
object AppRpt {

  /**
   * Entry point.
   *
   * @param args exactly three values: parquet input path, output path
   *             (currently unused) and dictionary file path
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println("目录不存在,请重新输入")
      // A usage error must not look like a successful run to the caller.
      sys.exit(1)
    }
    // The second argument (output path) is accepted but not used by this job.
    val Array(inputPath, _, resultPath) = args

    val conf = new SparkConf()
      .setAppName(s"${this.getClass.getName}")
      .setMaster("local[*]")
      // Use Kryo for Spark's internal serialization.
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    try {
      val sQLContext = new SQLContext(sc)
      // Set the parquet codec explicitly rather than relying on the version default.
      sQLContext.setConf("spark.sql.parquet.compression.codec", "snappy")

      // Dictionary file: keep rows with at least 5 columns, build app id -> app name.
      val dicMap: Map[String, String] = sc.textFile(resultPath)
        .map(_.split("\t", -1))
        .filter(_.length >= 5)
        .map(arr => (arr(4), arr(1)))
        .collect()
        .toMap

      // Broadcast the small lookup table once instead of shipping it with every task.
      val broadcast = sc.broadcast(dicMap)

      // Going through `.rdd` keeps the pair-RDD operations below available
      // regardless of whether `df` is a 1.x DataFrame or a 2.x Dataset.
      val df = sQLContext.read.parquet(inputPath)
      val report = df.rdd.map { row =>
        val rawName = row.getAs[String]("appname")
        // Blank name: resolve it from the dictionary by app id.
        val appname =
          if (StringUtils.isNotBlank(rawName)) rawName
          else broadcast.value.getOrElse(row.getAs[String]("appid"), "unknow")

        // Raw / effective / ad request inputs.
        val requestmode = row.getAs[Int]("requestmode")
        val processnode = row.getAs[Int]("processnode")
        val iseffective = row.getAs[Int]("iseffective")
        // Bidding, win, impression and click inputs.
        val isbilling = row.getAs[Int]("isbilling")
        val isbid = row.getAs[Int]("isbid")
        val iswin = row.getAs[Int]("iswin")
        val adorderid = row.getAs[Int]("adorderid")
        // Ad spend and ad cost inputs.
        val winPrice = row.getAs[Double]("winprice")
        val adpayment = row.getAs[Double]("adpayment")

        // Business rules live in RptUtils; each call returns a list of metrics.
        val reqlist = RptUtils.calculateReq(requestmode, processnode)
        val rtblist = RptUtils.calculateRtb(iseffective, isbilling, isbid, iswin, adorderid, winPrice, adpayment)
        val cliklist = RptUtils.calculateTimes(requestmode, iseffective)
        (appname, reqlist ++ rtblist ++ cliklist)
      }

      report
        // Element-wise sum: (0,2,1,5) + (2,5,4,7) => (2,7,5,12)
        .reduceByKey((list1, list2) => list1.zip(list2).map(t => t._1 + t._2))
        .map(t => t._1 + "," + t._2.mkString(","))
        .take(1000)
        .foreach(println)
    } finally {
      sc.stop()
    }
  }
}