移动平均法的实现--spark

案例场景
数据结构为 (k, t, v):k 是 id,t 是时间,v 是该 id 在时间点 t 对应的值。

示例数据(各列依次为:股票代码,时间,收盘价):
 
AA,2017-1-7,10.8
 
AA,2017-1-8,10.9
 
AA,2017-1-9,11
 
AA,2017-1-30,10.5
 
BB,2017-1-31,10.7
 
BB,2017-2-1,10.9
 
BB,2017-2-2,11.1
 

移动平均-内存排序

/**
  * Computes a simple moving average per key by sorting each key's values in memory.
  *
  * Each non-blank input line is expected to look like `<key>,<date>,<value>`, where
  * the date is parsed with the pattern `yyyy-MM-dd`. For every key the values are
  * ordered by date, and each output line `<key>,<date>,<average>` carries the mean of
  * the current value and up to `period - 1` values preceding it.
  *
  * Arguments: `<period> <input-path> <output-path>`. The output path is only needed
  * if the commented-out `saveAsTextFile` call is enabled; by default results are printed.
  **/
object MovingAverageInMemory {
    def main(args: Array[String]): Unit = {
        if (args.length < 3) {
            println("Usage: MovingAverageInMemory <period> <input-path> <output-path>")
            sys.exit(1)
        }
        // Width of the moving window. A non-positive width would leave the window
        // empty and turn every average into 0.0 / 0 (NaN), so reject it up front.
        val period: Int = args(0).toInt
        require(period > 0, s"period ($period) must be greater than 0.")
        // Input file path
        val inputPath: String = args(1)
        // Output path (only used when the save call below is enabled)
        val outputPath: String = args(2)

        // Fall back to a local master only when none was supplied externally,
        // so a master passed at submit time is not overridden.
        val sparkConf: SparkConf = new SparkConf()
            .setIfMissing("spark.master", "local[1]")
            .setAppName("MovingAverageInMemory")
        // Build the Spark context
        val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
        try {
            // Broadcast variable holding the window width
            val broadcastPeriod: Broadcast[Int] = sc.broadcast(period)
            // Read the raw file, skipping blank lines that cannot be split into fields
            val rawData: RDD[String] = sc.textFile(inputPath)
            val keyValue: RDD[(String, (String, Double))] = rawData
                .filter(_.trim.nonEmpty)
                .map(line => {
                    val tokens = line.split(",")
                    (tokens(0), (tokens(1), tokens(2).toDouble))
                })
            // Collect all (date, value) pairs of a key; their order is fixed by the sort below
            val groupValue: RDD[(String, Iterable[(String, Double)])] = keyValue.groupByKey()
            val movingAverage: RDD[(String, Seq[(String, Double)])] = groupValue.mapValues(values => {
                // SimpleDateFormat is not thread-safe, so each group gets its own instance
                val dateFormat: SimpleDateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
                // Sorting happens in memory; use with care on large groups
                val sortedValues: Seq[(Long, Double)] =
                    values.toVector.map(s => (dateFormat.parse(s._1).getTime, s._2)).sortBy(_._1)
                val averages: Seq[Double] =
                    windowAverages(sortedValues.map(_._2).toVector, broadcastPeriod.value)
                sortedValues.zip(averages).map { case ((timestamp, _), average) =>
                    (dateFormat.format(new java.util.Date(timestamp)), average)
                }
            })

            val formattedResult: RDD[String] = movingAverage.sortByKey().flatMap { case (key, averages) =>
                averages.map { case (date, average) => s"$key,$date,$average" }
            }

            // Save the result
            //formattedResult.saveAsTextFile(outputPath)
            formattedResult.foreach(println)
        } finally {
            // Release the context even if the job fails
            sc.stop()
        }
    }

    /**
      * Trailing moving average over `prices`.
      *
      * @param prices values already in time order
      * @param period window width, must be positive
      * @return for each index `i`, the mean of `prices(max(0, i - period + 1)) .. prices(i)`
      */
    private def windowAverages(prices: Vector[Double], period: Int): Seq[Double] =
        prices.indices.map { i =>
            val window = prices.slice(math.max(0, i - period + 1), i + 1)
            window.sum / window.size
        }
}

移动平均-自定义排序

/**
  * Computes a simple moving average per stock symbol using a secondary sort.
  *
  * Each non-blank input line is expected to look like `<symbol>,<date>,<price>`, where
  * the date is parsed with the pattern `yyyy-MM-dd`. Records are partitioned by symbol
  * and sorted by (symbol, timestamp) inside each partition, then the moving average is
  * computed by streaming over the sorted partition. No `groupByKey` is applied after
  * the sort: it would shuffle again and Spark gives no ordering guarantee for the
  * values inside a group, which would defeat the secondary sort.
  *
  * Arguments: `<period> <input-path> <output-path>`. The output path is only needed
  * if the commented-out `saveAsTextFile` call is enabled; by default results are printed.
  **/
object MovingAverageCustomSort {
    def main(args: Array[String]): Unit = {
        if (args.length < 3) {
            println("Usage: MovingAverageCustomSort <period> <input-path> <output-path>")
            sys.exit(1)
        }
        // Width of the moving window. A non-positive width would leave the window
        // empty and turn every average into 0.0 / 0 (NaN), so reject it up front.
        val period: Int = args(0).toInt
        require(period > 0, s"period ($period) must be greater than 0.")
        // Input path
        val inputPath: String = args(1)
        // Output path (only used when the save call below is enabled)
        val outputPath: String = args(2)
        // Number of partitions
        val numPartitions: Int = 4
        // Fall back to a local master only when none was supplied externally,
        // so a master passed at submit time is not overridden.
        val sparkConf: SparkConf = new SparkConf()
            .setIfMissing("spark.master", "local[2]")
            .setAppName("MovingAverageCustomSort")
        // Build the Spark context
        val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
        try {
            val broadcastPeriod: Broadcast[Int] = sc.broadcast(period)
            // Read the raw file
            val rawData: RDD[String] = sc.textFile(inputPath)

            // Key contains part of value (closing date in this case).
            // Blank lines are skipped; the date parser is created once per partition.
            val valueToKey: RDD[(CompositeKey, TimeSeriesData)] = rawData
                .filter(_.trim.nonEmpty)
                .mapPartitions(lines => {
                    val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
                    lines.map(line => {
                        val tokens = line.split(",")
                        val timestamp = dateFormat.parse(tokens(1)).getTime
                        (CompositeKey(tokens(0), timestamp), TimeSeriesData(timestamp, tokens(2).toDouble))
                    })
                })

            // Secondary sort: all records of a symbol land in one partition, ordered by time
            val sortedData: RDD[(CompositeKey, TimeSeriesData)] =
                valueToKey.repartitionAndSortWithinPartitions(new CompositeKeyPartitioner(numPartitions))

            // Walk each sorted partition once, restarting the window whenever the symbol changes.
            // The iterator is consumed sequentially, so the mutable window state stays local to it.
            val movingAverage: RDD[(CompositeKey, Double)] = sortedData.mapPartitions(records => {
                val width = broadcastPeriod.value
                val window = new scala.collection.mutable.Queue[Double]()
                var currentSymbol: Option[String] = None
                records.map { case (key, data) =>
                    if (!currentSymbol.exists(_ == key.stockSymbol)) {
                        window.clear()
                        currentSymbol = Some(key.stockSymbol)
                    }
                    window.enqueue(data.closingStockPrice)
                    if (window.size > width) {
                        window.dequeue()
                    }
                    (key, window.sum / window.size)
                }
            }, preservesPartitioning = true)

            // Global order by (symbol, timestamp), then render `<symbol>,<date>,<average>`
            val formattedResult: RDD[String] = movingAverage.sortByKey().mapPartitions(averages => {
                val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
                averages.map { case (key, average) =>
                    s"${key.stockSymbol},${dateFormat.format(new java.util.Date(key.timeStamp))},$average"
                }
            })
            // Save the result
            //formattedResult.saveAsTextFile(outputPath)
            formattedResult.foreach(println)
        } finally {
            // Release the context even if the job fails
            sc.stop()
        }
    }
}


/**
  * A single observation of a time series.
  *
  * @param timeStamp         time of the observation as a `Long`
  * @param closingStockPrice closing price recorded for that time
  **/
case class TimeSeriesData(
    timeStamp: Long,
    closingStockPrice: Double
)

/**
  * Composite key used for custom sorting: a stock symbol paired with a timestamp.
  *
  * @param stockSymbol symbol the record belongs to
  * @param timeStamp   time of the record as a `Long`
  **/
case class CompositeKey(stockSymbol: String, timeStamp: Long)

object CompositeKey {
    /**
      * Orders keys by stock symbol first and, for equal symbols, by timestamp.
      */
    implicit def ordering[A <: CompositeKey]: Ordering[A] = new Ordering[A] {
        override def compare(left: A, right: A): Int = {
            val bySymbol = left.stockSymbol.compareTo(right.stockSymbol)
            if (bySymbol != 0) bySymbol
            else java.lang.Long.compare(left.timeStamp, right.timeStamp)
        }
    }
}

/**
  * Partitioner for the secondary sort: a [[CompositeKey]] is routed by its stock
  * symbol only, so every record of one symbol ends up in the same partition
  * regardless of its timestamp. Any other non-null key is routed by its own hash
  * code, and a `null` key goes to partition 0.
  *
  * @param partitions number of partitions; must be greater than 0, because the
  *                   partition index is computed with `% partitions`
  **/
class CompositeKeyPartitioner(partitions: Int) extends Partitioner {
    // Zero would pass a `>= 0` check but then fail with a division by zero in getPartition.
    require(partitions > 0, s"Number of partitions ($partitions) must be greater than 0.")

    // Number of partitions
    def numPartitions: Int = partitions

    def getPartition(key: Any): Int = key match {
        case k: CompositeKey => bucketOf(k.stockSymbol)
        case null => 0
        case _ => bucketOf(key)
    }

    // The remainder lies strictly between -numPartitions and numPartitions,
    // so its absolute value is always a valid partition index.
    private def bucketOf(value: Any): Int = math.abs(value.hashCode % numPartitions)

    override def equals(other: Any): Boolean = other match {
        case h: CompositeKeyPartitioner => h.numPartitions == numPartitions
        case _ => false
    }

    override def hashCode: Int = numPartitions
}

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

orange大数据技术探索者

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值