移动平均法的实现--spark

最新推荐文章于 2022-10-13 21:26:22 发布

orange大数据技术探索者

最新推荐文章于 2022-10-13 21:26:22 发布

阅读量1k

点赞数 1

分类专栏：# spark # 机器学习
文章标签：大数据算法

本文链接：https://blog.csdn.net/weixin_43283487/article/details/88737457

版权

spark 同时被 2 个专栏收录

29 篇文章 0 订阅

订阅专栏

机器学习

10 篇文章 1 订阅

订阅专栏

案例场景
数据结构为 (k, t, v)：k 是 id，t 是时间，v 是该 id 在时间点 t 对应的值。示例数据的各列依次为：

股票代码,时间,收盘价
 
AA,2017-1-7,10.8
 
AA,2017-1-8,10.9
 
AA,2017-1-9,11
 
AA,2017-1-30,10.5
 
BB,2017-1-31,10.7
 
BB,2017-2-1,10.9
 
BB,2017-2-2,11.1

移动平均-内存排序

/**
  * Computes a simple moving average per key, sorting each key's series in memory.
  *
  * Each input line has the form `key,date,value` with the date parsed by the
  * pattern `yyyy-MM-dd`. For every record the program emits `key,date,average`,
  * where the average covers at most the last `period` values of that key in
  * date order.
  *
  * Arguments: `<period> <input-path> <output-path>`; `period` must be a positive integer.
  **/
object MovingAverageInMemory {
    def main(args: Array[String]): Unit = {
        if (args.length < 3) {
            println("Usage: MovingAverageInMemory <period> <input-path> <output-path>")
            sys.exit(1)
        }
        // Window width. A non-numeric or non-positive value is rejected here:
        // with period <= 0 the window would always be empty and every average NaN.
        val period: Int = scala.util.Try(args(0).toInt).toOption.filter(_ > 0).getOrElse {
            println(s"<period> must be a positive integer, got: ${args(0)}")
            sys.exit(1)
        }
        // Input file path
        val inputPath: String = args(1)
        // Output path; only referenced by the disabled saveAsTextFile call below
        val outputPath: String = args(2)

        val sparkConf: SparkConf = new SparkConf()
            .setMaster("local[1]")
            .setAppName("MovingAverageInMemory")
        // Build the Spark context
        val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
        try {
            // Broadcast the window width to the executors
            val broadcastPeriod: Broadcast[Int] = sc.broadcast(period)
            // Read the raw lines, skipping blank ones that cannot be split into three fields
            val rawData: RDD[String] = sc.textFile(inputPath).filter(_.trim.nonEmpty)
            val keyValue: RDD[(String, (String, Double))] = rawData.map(line => {
                val tokens = line.split(",")
                (tokens(0), (tokens(1), tokens(2).toDouble))
            })
            // Collect all (date, value) pairs of a key. Vector keeps the arrival order
            // like the List it replaces, but appends in effectively constant time.
            val groupValue: RDD[(String, Vector[(String, Double)])] = keyValue.combineByKey(
                (v: (String, Double)) => Vector(v),
                (c: Vector[(String, Double)], v: (String, Double)) => c :+ v,
                (c1: Vector[(String, Double)], c2: Vector[(String, Double)]) => c1 ++ c2
            )
            val movingAverage: RDD[(String, Seq[(String, Double)])] = groupValue.mapValues(values => {
                // SimpleDateFormat is not thread-safe, so one instance is created per key
                val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
                // Sorted in memory: use with care for keys that have very large series
                val sortedValues: Seq[(Long, Double)] =
                    values.map(s => (dateFormat.parse(s._1).getTime, s._2)).sortBy(_._1)
                // Sliding window holding at most `period` most recent values
                val queue = new scala.collection.mutable.Queue[Double]()
                for (tup <- sortedValues) yield {
                    queue.enqueue(tup._2)
                    if (queue.size > broadcastPeriod.value) {
                        queue.dequeue()
                    }
                    (dateFormat.format(new java.util.Date(tup._1)), queue.sum / queue.size)
                }
            })

            val formattedResult: RDD[String] = movingAverage.sortByKey().flatMap(kv => {
                kv._2.map(v => s"${kv._1},${v._1},${v._2}")
            })

            // Save the result
            //formattedResult.saveAsTextFile(outputPath)
            formattedResult.foreach(println)
        } finally {
            // Release the context even when the job fails
            sc.stop()
        }
    }
}

移动平均-自定义排序

/**
  * Computes a simple moving average per stock symbol using a secondary sort.
  *
  * Each input line has the form `symbol,date,price` with the date parsed by the
  * pattern `yyyy-MM-dd`. Records are partitioned by symbol and sorted by
  * (symbol, timestamp) inside each partition, so the moving average can be
  * computed in one streaming pass without holding a whole series in memory.
  * For every record the program emits `symbol,date,average`, where the average
  * covers at most the last `period` prices of that symbol.
  *
  * Arguments: `<period> <input-path> <output-path>`; `period` must be a positive integer.
  **/
object MovingAverageCustomSort {
    def main(args: Array[String]): Unit = {
        if (args.length < 3) {
            println("Usage: MovingAverageCustomSort <period> <input-path> <output-path>")
            sys.exit(1)
        }
        // Window width. A non-numeric or non-positive value is rejected here:
        // with period <= 0 the window would always be empty and every average NaN.
        val period: Int = scala.util.Try(args(0).toInt).toOption.filter(_ > 0).getOrElse {
            println(s"<period> must be a positive integer, got: ${args(0)}")
            sys.exit(1)
        }
        // Input path
        val inputPath: String = args(1)
        // Output path; only referenced by the disabled saveAsTextFile call below
        val outputPath: String = args(2)
        // Number of partitions used by the secondary sort
        val numPartitions: Int = 4
        val sparkConf: SparkConf = new SparkConf()
            .setMaster("local[2]")
            .setAppName("MovingAverageCustomSort")
        // Build the Spark context
        val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
        try {
            val broadcastPeriod: Broadcast[Int] = sc.broadcast(period)
            // Read the raw lines, skipping blank ones that cannot be split into three fields
            val rawData: RDD[String] = sc.textFile(inputPath).filter(_.trim.nonEmpty)

            // The key carries part of the value (the closing date) so Spark can sort on it
            val valueToKey: RDD[(CompositeKey, TimeSeriesData)] = rawData.mapPartitions(lines => {
                // One formatter per partition; SimpleDateFormat is not thread-safe
                val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
                lines.map(line => {
                    val tokens = line.split(",")
                    val timestamp = dateFormat.parse(tokens(1)).getTime
                    (CompositeKey(tokens(0), timestamp), TimeSeriesData(timestamp, tokens(2).toDouble))
                })
            })

            // Secondary sort: all records of a symbol land in one partition, ordered by time
            val sortedData: RDD[(CompositeKey, TimeSeriesData)] =
                valueToKey.repartitionAndSortWithinPartitions(new CompositeKeyPartitioner(numPartitions))

            // Walk each sorted partition once. This deliberately avoids groupByKey, which
            // shuffles again and does not guarantee the order of elements within a group.
            val movingAverage: RDD[(CompositeKey, Double)] = sortedData.mapPartitions(records => {
                val queue = new scala.collection.mutable.Queue[Double]()
                var currentSymbol: Option[String] = None
                records.map { case (key, data) =>
                    // A new symbol starts a fresh window
                    if (!currentSymbol.exists(_ == key.stockSymbol)) {
                        queue.clear()
                        currentSymbol = Some(key.stockSymbol)
                    }
                    queue.enqueue(data.closingStockPrice)
                    if (queue.size > broadcastPeriod.value) {
                        queue.dequeue()
                    }
                    (key, queue.sum / queue.size)
                }
            })

            // Order the output by (symbol, timestamp) and render one line per record
            val formattedResult: RDD[String] = movingAverage.sortByKey().mapPartitions(rows => {
                val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
                rows.map { case (key, average) =>
                    s"${key.stockSymbol},${dateFormat.format(new java.util.Date(key.timeStamp))},$average"
                }
            })
            // Save the result
            //formattedResult.saveAsTextFile(outputPath)
            formattedResult.foreach(println)
        } finally {
            // Release the context even when the job fails
            sc.stop()
        }
    }
}


/**
  * One observation of a time series.
  *
  * @param timeStamp         observation time; MovingAverageCustomSort fills it with
  *                          `getTime` of the parsed date and later turns it back
  *                          into a `java.util.Date` for formatting
  * @param closingStockPrice value observed at that time (the closing price in the sample data)
  **/
case class TimeSeriesData(timeStamp: Long, closingStockPrice: Double)

/**
  * Composite sort key: a stock symbol together with a timestamp.
  *
  * @param stockSymbol primary sort component
  * @param timeStamp   secondary sort component, compared only when the symbols are equal
  **/
case class CompositeKey(
    stockSymbol: String,
    timeStamp: Long
)

object CompositeKey {
    /** Orders keys by symbol first and by timestamp second. */
    implicit def ordering[A <: CompositeKey]: Ordering[A] = new Ordering[A] {
        override def compare(left: A, right: A): Int = {
            val bySymbol = left.stockSymbol.compareTo(right.stockSymbol)
            if (bySymbol != 0) bySymbol
            else java.lang.Long.compare(left.timeStamp, right.timeStamp)
        }
    }
}

/**
  * Partitioner that places a [[CompositeKey]] by its stock symbol only, so every
  * record of one symbol ends up in the same partition regardless of its timestamp.
  *
  * @param partitions number of partitions; must be greater than 0 because
  *                   `getPartition` takes hash codes modulo this value
  **/
class CompositeKeyPartitioner(partitions: Int) extends Partitioner {
    require(partitions > 0, s"Number of partitions ($partitions) must be greater than 0.")

    /** Number of partitions produced by this partitioner. */
    def numPartitions: Int = partitions

    /**
      * Maps a key to a partition index in `[0, numPartitions)`.
      *
      * A `CompositeKey` is placed by the hash of its symbol, `null` goes to
      * partition 0, and any other key is placed by its own hash code.
      */
    def getPartition(key: Any): Int = key match {
        case null => 0
        // The remainder lies strictly between -numPartitions and numPartitions,
        // so taking its absolute value always gives a valid index.
        case k: CompositeKey => math.abs(k.stockSymbol.hashCode % numPartitions)
        case other => math.abs(other.hashCode % numPartitions)
    }

    /** Two partitioners of this class are equal when they have the same partition count. */
    override def equals(other: Any): Boolean = other match {
        case h: CompositeKeyPartitioner => h.numPartitions == numPartitions
        case _ => false
    }

    override def hashCode: Int = numPartitions
}

orange大数据技术探索者

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
打赏
0
评论
移动平均法的实现--spark

案例场景：数据结构为 (k, t, v)，k 是 id，t 是时间，v 是该 id 在时间点 t 对应的值。示例数据（股票代码,时间,收盘价）：AA,2017-1-7,10.8 AA,2017-1-8,10.9 AA,2017-1-9,11 AA,2017-1-30,10.5 BB,2017-1-31,10.7 BB,2017-2-1,10.9 BB,2017-2-2,11.1 ...
复制链接

扫一扫