// Spark RDD operator data-transformation example

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Test04Rdd {

  /** One raw record from the sales file: shop id, timestamp string, sales volume. */
  case class Sale(shopId: Int, date: String, volume: Float) {
    /** Daily grouping key "<shopId>_<yyyy-MM-dd>" (date part via the strToAny helper). */
    def symd = {
      val day = date.ymd
      s"${shopId}_$day"
    }

    /** Debug helper: prints the record as "{shopId\tdate\tvolume},". */
    def v = {
      val rendered = s"{$shopId\t$date\t$volume},"
      print(rendered)
    }

    //Unused alternative keys kept from the original for reference:
    /*def sym = s"${shopId}_${date.ym}"
    def sy = s"${shopId}_${date.y}"*/
  }

  /** A Sale together with its computed year-over-year / month-over-month ratio. */
  case class SaleRatio(sale: Sale, ratio: Float) {
    /** Debug helper: prints "{shopId\tdate\tvolume\tratio},". */
    def v = {
      val rendered = s"{${sale.shopId}\t${sale.date}\t${sale.volume}\t$ratio},"
      print(rendered)
    }
  }

  /**
   * String enrichment used throughout the job: numeric parsing, record
   * parsing, and grouping-key construction/decomposition.
   */
  implicit class strToAny(v: String) {
    /** Parses the string as an Int. */
    def ti = v.toInt

    /** Parses the string as a Float. */
    def tf = v.toFloat

    /** Parses one sales line "shopId,date,volume" into a Sale. */
    def toSale = {
      val ps = v.split(",")
      Sale(ps(0).ti, ps(1), ps(2).tf)
    }

    /**
     * Rebuilds a Sale from a grouping key "shopId_yyyy-MM" plus a volume.
     * NOTE(review): only the month component (ym(1)) is kept as the date —
     * the year is dropped; confirm this is intentional (never called in
     * the visible code).
     */
    def toSale(volume: Float) = {
      val ps = v.split("_")
      val ym = ps(1).split("-")
      Sale(ps(0).ti, ym(1), volume)
    }

    /** Date part "yyyy-MM-dd" of a "yyyy-MM-dd HH:mm:ss" timestamp. */
    def ymd = {
      v.split(" ")(0)
    }

    /** Year-month "yyyy-MM" of the date. */
    def ym = {
      val ps = v.ymd.split("-")
      s"${ps(0)}-${ps(1)}"
    }

    /** Year "yyyy" of the date. */
    def y = {
      v.ym.split("-")(0)
    }

    /** Monthly key "shopId_yyyy-MM" from a daily key "shopId_yyyy-MM-dd". */
    def sym = {
      val ps = v.split("_")
      s"${ps(0)}_${ps(1).ym}"
    }

    /** Yearly key "shopId_yyyy" from a daily/monthly grouping key. */
    def sy = {
      val ps = v.split("_")
      s"${ps(0)}_${ps(1).y}"
    }

    /** Shop-id prefix of a grouping key. */
    def s = v.split("_")(0)

    /**
     * Parses a calendar line "shopId,yyyy,MM" into the monthly-key pair
     * ("shopId_yyyy-MM", 0.0f) used as the join placeholder.
     *
     * Bug fix: the original template ended with a stray literal brace —
     * s"${ps(0)}_${ps(1)}-${ps(2)}}" — producing keys like "1_2020-01}"
     * that could never equal the "shopId_yyyy-MM" keys built by sym, so
     * the leftOuterJoin in main always missed and every month reported
     * the 0.0f placeholder.
     */
    def dateYSM = {
      val ps = v.split(",")
      (s"${ps(0)}_${ps(1)}-${ps(2)}", 0.0f)
    }
  }

  /**
   * Entry point: aggregates per-shop sales to daily then monthly totals,
   * and left-joins a full calendar of (shopId, year, month) keys against
   * the totals so that months with no sales still appear with 0.0f.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkrdd-02")
    val sc = new SparkContext(conf)
    //Paths: local file:///absolute_path ; remote hdfs://192.168.131.200:9820/absolute_path
    //Stage 1: daily aggregation keyed "shopId_year-month-day"
    //Stage 2: monthly aggregation keyed "shopId_year-month"
    //NOTE(review): `months` is never used below — candidate for removal.
    val months = Array("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12")
    //RDD[(String => shopId+year+month key, Float => monthly total)]
    val rddSYM: RDD[(String, Float)] = sc.textFile("file:///D:/Study/13_spark/cha01/file/sales5.txt", 5)
      //Daily aggregation: sum volumes per shop per day
      .map(x => {
        val sale = x.toSale
        //Key shopId_yyyy-MM-dd gives per-day grouping
        (sale.symd, sale.volume)
      }).reduceByKey(_ + _) //equivalent to groupByKey+reduce but with a map-side combiner
      //Monthly aggregation: sum the daily totals
      .map(x =>
        //Key shopId_yyyy-MM gives per-month grouping
        (x._1.sym, x._2)
      ).reduceByKey(_ + _)
      .mapPartitions(it => it.map(x => {
        //(shopId, (shopId_yyyy-MM, monthlyTotal))
        (x._1.s, x)
      })).groupByKey() //co-locate each shop's records in one partition
      .mapPartitions(it => it.flatMap(_._2))
      .cache() //cache is optional here: no action has run yet, so the lineage above is still lazy
    //rddSYM.saveAsTextFile("output")

    //Calendar RDD: every expected "shopId_yyyy-MM" key paired with the 0.0f placeholder
    val rddDateSYM: RDD[(String, Float)] = sc.textFile("file:///D:\\Study\\13_spark\\cha01\\file\\date.txt")
      .mapPartitions(it => it.map(x=>{
        val ysm = x.dateYSM
        (ysm._1.s,ysm)
      })).groupByKey()
      .mapPartitions(it=>it.flatMap(_._2))
      .cache()
    //Left join calendar against actual totals: months with sales take the
    //aggregated value, months without keep the 0.0f placeholder.
    //NOTE(review): `if (v._2.isEmpty) v._1 else v._2.get` is `v._2.getOrElse(v._1)`.
    rddDateSYM.leftOuterJoin(rddSYM)
      .mapPartitions(it=>it.map(x=>{
        val v = x._2
        (x._1,if(v._2.isEmpty) v._1 else v._2.get)
      }))
      .coalesce(1)
      .sortByKey()
      .foreach(println)
  }
}

// (CSDN blog-page footer — like/favorite buttons and payment UI text — removed; not part of the source.)