shop1,2019-01-18,500
shop1,2019-02-10,500
shop1,2019-02-10,200
shop1,2019-02-11,600
shop1,2019-02-12,400
shop1,2019-02-13,200
shop1,2019-02-15,100
shop1,2019-03-05,180
shop1,2019-04-05,280
shop1,2019-04-06,220
shop2,2019-02-10,100
shop2,2019-02-11,100
shop2,2019-02-13,100
-
Requirement: for each shop, compute the total sales per month and the cumulative sales up to and including the current month.
import doit.day05_t.utils.SparkUtils
import org.apache.spark.rdd.RDD
/**
 * Computes, per shop, each month's total sales together with the running
 * (cumulative) sales up to and including that month.
 *
 * Input: CSV lines of the form `shopId,yyyy-MM-dd,amount`.
 * Output rows: (shopId, month, monthlySales, cumulativeSalesToDate).
 */
object RollupMthIncomeRDD {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines: RDD[String] = sc.textFile("src/main/scala/data/shop.csv")

    // Aggregate sales by (shopId, month); the month is the yyyy-MM prefix of the date.
    val reduced: RDD[((String, String), Double)] = lines.map { line =>
      val fields = line.split(",")
      val sid = fields(0)
      val mth = fields(1).substring(0, 7) // yyyy-MM
      val money = fields(2).toDouble
      ((sid, mth), money)
    }.reduceByKey(_ + _)

    // Group by shop id, sort each shop's months, and accumulate a running total.
    // NOTE: groupBy materializes one shop's rows in memory (via toList); acceptable
    // here because the number of distinct months per shop is small.
    val result: RDD[(String, String, Double, Double)] = reduced
      .groupBy(_._1._1)
      .mapValues { it =>
        // Lexicographic sort on yyyy-MM is also chronological.
        val sorted: List[((String, String), Double)] = it.toList.sortBy(_._1._2)
        var rollup = 0.0
        sorted.map { case ((_, mth), mthSales) =>
          rollup += mthSales
          (mth, mthSales, rollup)
        }
      }
      .flatMapValues(identity)
      .map { case (sid, (mth, mthSales, rollup)) => (sid, mth, mthSales, rollup) }

    result.foreach(println)
    sc.stop()
  }
}