Spark移动平均:时间序列数据平均值

一、内存排序

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext


/**
 * Spark job that computes a moving average of stock prices per symbol, sorting each symbol's
 * time series in memory once all of its records have been grouped together.
 *
 * Input lines have the form `symbol,yyyy-MM-dd,price` (e.g. `GOOG,2004-11-04,184.70`).
 * Output lines have the form `symbol,yyyy-MM-dd,average` (e.g. `IBM,2013-09-26,189.845`).
 *
 * The window is counted in records, not calendar days: each average covers the current record
 * and up to `window - 1` records that precede it in date order for the same symbol.
 */
object MovingAverageInMemory {

  /**
   * Runs the job against the hard-coded input and output paths with a window of 3 records.
   *
   * @param args command-line arguments; not read by this job
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("MovingAverageInMemory").setMaster("local")
    val sc = new SparkContext(sparkConf)

    // Stop the context even when the job throws, so a failed run does not leak it.
    try {
      val window = 3
      val input = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1.txt"
      val output = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1"

      val broadcastWindow = sc.broadcast(window)

      // Blank lines carry no record and would otherwise fail on tokens(1) below.
      val rawData = sc.textFile(input).filter(_.trim.nonEmpty)

      // "GOOG,2004-11-04,184.70" -> ("GOOG", ("2004-11-04", 184.7))
      val keyValue = rawData.map { line =>
        val tokens = line.split(",")
        (tokens(0), (tokens(1), tokens(2).toDouble))
      }

      // ("IBM", [("2013-09-30", 185.18), ("2013-09-27", 186.92), ...]) -- one group per symbol,
      // values in no particular date order.
      val groupByStockSymbol = keyValue.groupByKey()

      val result = groupByStockSymbol.mapValues { values =>
        // A formatter is created for each group rather than shared across them.
        val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")

        // Convert dates to epoch milliseconds so the series can be sorted chronologically.
        val sortedValues = values
          .map { case (date, price) => (dateFormat.parse(date).getTime, price) }
          .toSeq
          .sortBy(_._1)

        // Sliding window over the most recent prices; while fewer than `window` records have
        // been seen, the average covers only the records seen so far.
        val queue = new scala.collection.mutable.Queue[Double]()
        sortedValues.map { case (timestamp, price) =>
          queue.enqueue(price)
          if (queue.size > broadcastWindow.value) queue.dequeue()
          (dateFormat.format(new java.util.Date(timestamp)), queue.sum / queue.size)
        }
      }

      // ("IBM", [("2013-09-25", 189.47), ("2013-09-26", 189.845), ...]) -> "IBM,2013-09-25,189.47", ...
      val formattedResult = result.flatMap { case (symbol, averages) =>
        averages.map { case (date, average) => s"$symbol,$date,$average" }
      }

      formattedResult.saveAsTextFile(output)
    } finally {
      sc.stop()
    }
  }
}


二、

自定义分区器CompositeKeyPartitioner


import MovingAverage.CompositeKey
import org.apache.spark.Partitioner

/**
 * Partitioner that places a [[CompositeKey]] according to its `stockSymbol` only, so every
 * composite key sharing a stock symbol is assigned the same partition index.
 *
 * Keys of any other type are placed by their own `hashCode`; a `null` key goes to partition 0.
 *
 * @param partitions number of partitions; must not be negative
 */
class CompositeKeyPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

  /** The partition count supplied at construction. */
  def numPartitions: Int = partitions

  /**
   * Maps a key to a partition index.
   *
   * @param key the record key; may be `null`
   * @return 0 for `null`, otherwise the absolute value of the chosen hash modulo `numPartitions`
   */
  def getPartition(key: Any): Int =
    if (key == null) {
      0
    } else {
      // Composite keys are hashed on the stock symbol alone; anything else on the whole key.
      val hash = key match {
        case composite: CompositeKey => composite.stockSymbol.hashCode
        case plain                   => plain.hashCode
      }
      // The remainder may be negative for a negative hash, hence the abs.
      math.abs(hash % numPartitions)
    }

  /** Two instances are equal when they have the same number of partitions. */
  override def equals(other: Any): Boolean = other match {
    case that: CompositeKeyPartitioner => that.numPartitions == numPartitions
    case _                             => false
  }

  /** Consistent with `equals`: derived from the partition count alone. */
  override def hashCode: Int = numPartitions
}



import org.apache.spark.SparkConf
import org.apache.spark.SparkContext


/**
 * Spark job that computes a moving average of stock prices per symbol, sorting each symbol's
 * time series in memory once all of its records have been grouped together.
 *
 * Input lines have the form `symbol,yyyy-MM-dd,price` (e.g. `GOOG,2004-11-04,184.70`).
 * Output lines have the form `symbol,yyyy-MM-dd,average` (e.g. `IBM,2013-09-26,189.845`).
 *
 * The window is counted in records, not calendar days: each average covers the current record
 * and up to `window - 1` records that precede it in date order for the same symbol.
 */
object MovingAverageInMemory {

  /**
   * Runs the job against the hard-coded input and output paths with a window of 3 records.
   *
   * @param args command-line arguments; not read by this job
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("MovingAverageInMemory").setMaster("local")
    val sc = new SparkContext(sparkConf)

    // Stop the context even when the job throws, so a failed run does not leak it.
    try {
      val window = 3
      val input = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1.txt"
      val output = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1"

      val broadcastWindow = sc.broadcast(window)

      // Blank lines carry no record and would otherwise fail on tokens(1) below.
      val rawData = sc.textFile(input).filter(_.trim.nonEmpty)

      // "GOOG,2004-11-04,184.70" -> ("GOOG", ("2004-11-04", 184.7))
      val keyValue = rawData.map { line =>
        val tokens = line.split(",")
        (tokens(0), (tokens(1), tokens(2).toDouble))
      }

      // ("IBM", [("2013-09-30", 185.18), ("2013-09-27", 186.92), ...]) -- one group per symbol,
      // values in no particular date order.
      val groupByStockSymbol = keyValue.groupByKey()

      val result = groupByStockSymbol.mapValues { values =>
        // A formatter is created for each group rather than shared across them.
        val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")

        // Convert dates to epoch milliseconds so the series can be sorted chronologically.
        val sortedValues = values
          .map { case (date, price) => (dateFormat.parse(date).getTime, price) }
          .toSeq
          .sortBy(_._1)

        // Sliding window over the most recent prices; while fewer than `window` records have
        // been seen, the average covers only the records seen so far.
        val queue = new scala.collection.mutable.Queue[Double]()
        sortedValues.map { case (timestamp, price) =>
          queue.enqueue(price)
          if (queue.size > broadcastWindow.value) queue.dequeue()
          (dateFormat.format(new java.util.Date(timestamp)), queue.sum / queue.size)
        }
      }

      // ("IBM", [("2013-09-25", 189.47), ("2013-09-26", 189.845), ...]) -> "IBM,2013-09-25,189.47", ...
      val formattedResult = result.flatMap { case (symbol, averages) =>
        averages.map { case (date, average) => s"$symbol,$date,$average" }
      }

      formattedResult.saveAsTextFile(output)
    } finally {
      sc.stop()
    }
  }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值