import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object Test04Rdd {
  // A record parsed from the raw sales file
  case class Sale(shopId: Int, date: String, volume: Float) {
    def symd = s"${shopId}_${date.ymd}"
    def v = print(s"{$shopId\t$date\t$volume},")
    /*def sym = s"${shopId}_${date.ym}"
    def sy = s"${shopId}_${date.y}"*/
  }
  // Wraps an original record together with its year-over-year / month-over-month ratio
  case class SaleRatio(sale: Sale, ratio: Float) {
    def v = print(s"{${sale.shopId}\t${sale.date}\t${sale.volume}\t$ratio},")
  }
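
  // Note: SaleRatio is not used in main below; it appears intended for a later
  // year-over-year / month-over-month step (see the commented-out sym/sy in Sale).
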
  implicit class strToAny(v: String) {
    def ti = v.toInt
    def tf = v.toFloat

    // v is one raw sales record: "shopId,date,volume"
    def toSale = {
      val ps = v.split(",")
      Sale(ps(0).ti, ps(1), ps(2).tf)
    }
    // v is a group key "shopId_year-month"; rebuilds a Sale from an aggregated volume
    def toSale(volume: Float) = {
      val ps = v.split("_")
      Sale(ps(0).ti, ps(1), volume)
    }
    // v is a datetime string like "yyyy-MM-dd HH:mm:ss"
    def ymd = {
      v.split(" ")(0)
    }

    def ym = {
      val ps = v.ymd.split("-")
      s"${ps(0)}-${ps(1)}"
    }

    def y = {
      v.ym.split("-")(0)
    }
    // v is a group key: "shopId_yyyy-MM-dd"
    def sym = {
      val ps = v.split("_")
      s"${ps(0)}_${ps(1).ym}"
    }

    def sy = {
      val ps = v.split("_")
      s"${ps(0)}_${ps(1).y}"
    }

    def s = v.split("_")(0)
    // v is one date-dimension record: "shopId,year,month"
    def dateYSM = {
      val ps = v.split(",")
      (s"${ps(0)}_${ps(1)}-${ps(2)}", 0.0f)
    }
  }
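
  // Hypothetical examples of the helpers above (sample values assumed, not taken from the data files):
  //   "3,2022-05-01 09:30:00,120.0".toSale => Sale(3, "2022-05-01 09:30:00", 120.0f)
  //   "2022-05-01 09:30:00".ymd            => "2022-05-01"
  //   "3_2022-05-01".sym                   => "3_2022-05"
  //   "3_2022-05-01".sy                    => "3_2022"
  //   "3,2022,05".dateYSM                  => ("3_2022-05", 0.0f)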
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkrdd-02")
    val sc = new SparkContext(conf)
    // Paths: local  file:///absolute_path
    //        remote hdfs://192.168.131.200:9820/absolute_path
    // Daily aggregation:   key shopId_year-month-day, sum per shop per day
    // Monthly aggregation: key shopId_year-month, sum per shop per month
    val months = Array("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12") // month dimension (unused below)
    // RDD[(String = shopId_year-month, Float = monthly total)]
    val rddSYM: RDD[(String, Float)] = sc.textFile("file:///D:/Study/13_spark/cha01/file/sales5.txt", 5)
      // daily aggregation: sum volumes
      .map(x => {
        val sale = x.toSale
        // key by shopId_year-month-day for the daily rollup
        (sale.symd, sale.volume)
      }).reduceByKey(_ + _) // like groupByKey + per-key reduce, but with a map-side combiner
      // monthly aggregation: sum the daily totals
      .map(x =>
        // key by shopId_year-month for the monthly rollup
        (x._1.sym, x._2)
      ).reduceByKey(_ + _)
      .mapPartitions(it => it.map(x => {
        // (shopId, (shopId_year-month, monthlyTotal))
        (x._1.s, x)
      })).groupByKey() // gathers all months of a shop under a single key
      .mapPartitions(it => it.flatMap(_._2))
      .cache() // optional here: no action has run yet, so the chain is still lazy, and the RDD is consumed only once below
    //rddSYM.saveAsTextFile("output")

    val rddDateSYM: RDD[(String, Float)] = sc.textFile("file:///D:/Study/13_spark/cha01/file/date.txt")
      .mapPartitions(it => it.map(x => {
        val ysm = x.dateYSM // ("shopId_year-month", 0.0f)
        (ysm._1.s, ysm)
      })).groupByKey()
      .mapPartitions(it => it.flatMap(_._2))
      .cache()
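
    // rddDateSYM is the full shop-month dimension keyed the same way as rddSYM, with
    // 0.0f placeholders, so the left join below keeps months that had no sales at all.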
    rddDateSYM.leftOuterJoin(rddSYM)
      .mapPartitions(it => it.map(x => {
        val v = x._2
        // take the real monthly total when present, otherwise the 0.0f placeholder
        (x._1, if (v._2.isEmpty) v._1 else v._2.get)
      }))
      .coalesce(1)
      .sortByKey()
      .foreach(println)
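    // Note: foreach(println) runs on the executors; the output is visible here only
    // because local[*] keeps everything in one JVM. On a cluster, collect() the result
    // (or saveAsTextFile) to see it on the driver.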
  }
}
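
As the inline comment on reduceByKey notes, it behaves like groupByKey followed by a per-key reduce, but with a map-side combiner, so far less data crosses the shuffle. A minimal standalone sketch of that equivalence (hypothetical example data, not the sales file above):

import org.apache.spark.{SparkConf, SparkContext}

object ReduceVsGroup {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("reduce-vs-group"))
    val pairs = sc.parallelize(Seq(("1_2022-01", 10f), ("1_2022-01", 5f), ("2_2022-01", 7f)))
    // reduceByKey pre-sums values inside each map partition before shuffling ...
    val reduced = pairs.reduceByKey(_ + _)
    // ... while groupByKey shuffles every individual value, then sums afterwards
    val grouped = pairs.groupByKey().mapValues(_.sum)
    reduced.sortByKey().collect().foreach(println) // (1_2022-01,15.0), (2_2022-01,7.0)
    grouped.sortByKey().collect().foreach(println) // same result, more shuffle traffic
    sc.stop()
  }
}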