案例场景
数据结构为 (k, t, v):k 是 id(股票代码),t 是时间,v 是该 id 在时间 t 对应的值(收盘价)。
股票代码,时间,收盘价
AA,2017-1-7,10.8
AA,2017-1-8,10.9
AA,2017-1-9,11
AA,2017-1-30,10.5
BB,2017-1-31,10.7
BB,2017-2-1,10.9
BB,2017-2-2,11.1
移动平均-内存排序
/**
 * Computes a per-symbol moving average of closing prices, sorting each symbol's
 * observations in memory.
 *
 * Each input line is split on "," into `symbol,date,price`; dates are parsed and
 * re-formatted with the `yyyy-MM-dd` pattern. For every input row one line
 * `symbol,date,average` is printed, where the average covers the current price
 * and up to `period - 1` preceding prices (by date) of the same symbol.
 **/
object MovingAverageInMemory {
  /**
   * Entry point.
   *
   * @param args `<period> <input-path> <output-path>`; `period` must be a
   *             positive integer. The output path is parsed but only used by
   *             the currently disabled `saveAsTextFile` call.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("Usage: MovingAverageInMemory <period> <input-path> <output-path>")
      sys.exit(1)
    }
    // Window width. A non-numeric value would throw and a value <= 0 would
    // empty the window on every step (0.0 / 0 = NaN), so reject both up front.
    val period: Int = scala.util.Try(args(0).toInt).toOption.filter(_ > 0).getOrElse {
      println(s"<period> must be a positive integer, got: ${args(0)}")
      sys.exit(1)
    }
    // Input file path
    val inputPath: String = args(1)
    // Output path (only needed by the disabled saveAsTextFile call below)
    val outputPath: String = args(2)
    val sparkConf: SparkConf = new SparkConf()
      .setMaster("local[1]")
      .setAppName("MovingAverageInMemory")
    // Build the Spark context
    val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
    try {
      // Broadcast variable holding the window width
      val broadcastPeriod: Broadcast[Int] = sc.broadcast(period)
      // Read the raw input lines
      val rawData: RDD[String] = sc.textFile(inputPath)
      val keyValue: RDD[(String, (String, Double))] = rawData.map(line => {
        val tokens = line.split(",")
        (tokens(0), (tokens(1), tokens(2).toDouble))
      })
      // groupByKey replaces a combineByKey that appended to an immutable List
      // with `:+` (O(n) per append, quadratic per key). The values are sorted
      // below, so the grouping order does not matter.
      val groupValue: RDD[(String, Iterable[(String, Double)])] = keyValue.groupByKey()
      val movingAverage: RDD[(String, Seq[(String, Double)])] = groupValue.mapValues(values => {
        val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
        val windowSize = broadcastPeriod.value
        // Sorting happens in memory: use with care on large per-key datasets.
        val sortedValues: Vector[(Long, Double)] = values
          .map { case (date, price) => (dateFormat.parse(date).getTime, price) }
          .toVector
          .sortBy(_._1)
        // Holds at most `windowSize` most recent prices, oldest first.
        val queue = new scala.collection.mutable.Queue[Double]()
        val averaged: Seq[(String, Double)] = sortedValues.map { case (timestamp, price) =>
          queue.enqueue(price)
          if (queue.size > windowSize) {
            queue.dequeue()
          }
          (dateFormat.format(new java.util.Date(timestamp)), queue.sum / queue.size)
        }
        averaged
      })
      val formattedResult: RDD[String] = movingAverage.sortByKey().flatMap(kv => {
        kv._2.map(v => kv._1 + "," + v._1 + "," + v._2.toString)
      })
      // Save the result
      //formattedResult.saveAsTextFile(outputPath)
      // Print on the driver, partition by partition, so the lines appear in the
      // order produced by sortByKey; RDD.foreach gives no ordering guarantee.
      formattedResult.toLocalIterator.foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
移动平均-自定义排序
/**
 * Computes a per-symbol moving average of closing prices using a secondary
 * sort on a composite (symbol, timestamp) key instead of sorting in memory.
 *
 * Each input line is split on "," into `symbol,date,price`; dates are parsed and
 * re-formatted with the `yyyy-MM-dd` pattern. For every input row one line
 * `symbol,date,average` is printed, where the average covers the current price
 * and up to `period - 1` preceding prices (by timestamp) of the same symbol.
 **/
object MovingAverageCustomSort {
  /**
   * Entry point.
   *
   * @param args `<period> <input-path> <output-path>`; `period` must be a
   *             positive integer. The output path is parsed but only used by
   *             the currently disabled `saveAsTextFile` call.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("Usage: MovingAverageCustomSort <period> <input-path> <output-path>")
      sys.exit(1)
    }
    // Window width. A non-numeric value would throw and a value <= 0 would
    // empty the window on every step (0.0 / 0 = NaN), so reject both up front.
    val period: Int = scala.util.Try(args(0).toInt).toOption.filter(_ > 0).getOrElse {
      println(s"<period> must be a positive integer, got: ${args(0)}")
      sys.exit(1)
    }
    // Input path
    val inputPath: String = args(1)
    // Output path (only needed by the disabled saveAsTextFile call below)
    val outputPath: String = args(2)
    // Number of partitions used for the secondary sort
    val numPartitions: Int = 4
    val sparkConf: SparkConf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("MovingAverageCustomSort")
    // Build the Spark context
    val sc: SparkContext = SparkContext.getOrCreate(sparkConf)
    try {
      val broadcastPeriod: Broadcast[Int] = sc.broadcast(period)
      // Read the raw input lines
      val rawData: RDD[String] = sc.textFile(inputPath)
      // Key contains part of value (closing date in this case).
      // One date formatter per partition instead of one per line.
      val valueToKey: RDD[(CompositeKey, TimeSeriesData)] = rawData.mapPartitions(lines => {
        val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
        lines.map(line => {
          val tokens = line.split(",")
          val timestamp = dateFormat.parse(tokens(1)).getTime
          (CompositeKey(tokens(0), timestamp), TimeSeriesData(timestamp, tokens(2).toDouble))
        })
      })
      // Secondary sort: partition by symbol, sort by (symbol, timestamp) inside
      // each partition.
      val sortedData: RDD[(CompositeKey, TimeSeriesData)] =
        valueToKey.repartitionAndSortWithinPartitions(new CompositeKeyPartitioner(numPartitions))
      // Stream over each sorted partition. A further groupByKey would shuffle
      // again and does not guarantee the order of values within a key, which
      // would discard the sort above. Within a partition all rows of a symbol
      // are contiguous and time-ordered, so the window only has to be reset
      // when the symbol changes.
      val movingAverage: RDD[(CompositeKey, Double)] = sortedData.mapPartitions(records => {
        val windowSize = broadcastPeriod.value
        // Holds at most `windowSize` most recent prices of the current symbol.
        val queue = new scala.collection.mutable.Queue[Double]()
        // Local mutable state; the returned iterator is consumed once, in order.
        var currentSymbol: Option[String] = None
        records.map { case (key, data) =>
          if (!currentSymbol.contains(key.stockSymbol)) {
            queue.clear()
            currentSymbol = Some(key.stockSymbol)
          }
          queue.enqueue(data.closingStockPrice)
          if (queue.size > windowSize) {
            queue.dequeue()
          }
          (key, queue.sum / queue.size)
        }
      }, preservesPartitioning = true)
      // Total order by (symbol, timestamp) for the final output.
      val formattedResult: RDD[String] = movingAverage.sortByKey().mapPartitions(rows => {
        val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
        rows.map { case (key, average) =>
          key.stockSymbol + "," + dateFormat.format(new java.util.Date(key.timeStamp)) + "," + average.toString
        }
      })
      // Save the result
      //formattedResult.saveAsTextFile(outputPath)
      // Print on the driver, partition by partition, so the lines appear in the
      // sorted order; RDD.foreach gives no ordering guarantee.
      formattedResult.toLocalIterator.foreach(println)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
/**
 * One time-series observation.
 *
 * @param timeStamp         observation time as a `Long`
 * @param closingStockPrice closing price recorded for that time
 **/
case class TimeSeriesData(
  timeStamp: Long,
  closingStockPrice: Double
)
/**
 * Composite sort key made of a stock symbol and a timestamp.
 *
 * @param stockSymbol symbol the record belongs to
 * @param timeStamp   time of the record as a `Long`
 **/
case class CompositeKey(stockSymbol: String, timeStamp: Long)

object CompositeKey {
  /**
   * Orders keys by stock symbol first and, for equal symbols, by timestamp.
   *
   * @tparam A the key type, `CompositeKey` or a subtype
   * @return an ordering comparing `(stockSymbol, timeStamp)` lexicographically
   */
  implicit def ordering[A <: CompositeKey]: Ordering[A] =
    new Ordering[A] {
      override def compare(left: A, right: A): Int = {
        val bySymbol = Ordering.String.compare(left.stockSymbol, right.stockSymbol)
        if (bySymbol != 0) bySymbol
        else Ordering.Long.compare(left.timeStamp, right.timeStamp)
      }
    }
}
/**
 * Partitioner for [[CompositeKey]] records that looks only at the stock symbol,
 * so every record of a symbol lands in the same partition regardless of its
 * timestamp.
 *
 * @param partitions number of partitions; must be greater than 0 because
 *                   `getPartition` takes a remainder modulo this value
 **/
class CompositeKeyPartitioner(partitions: Int) extends Partitioner {
  // The check used to be `>= 0`, which contradicted the message and let a
  // zero-partition instance fail later with a division by zero.
  require(partitions > 0, s"Number of partitions ($partitions) must be greater than 0.")

  /** Number of partitions */
  def numPartitions: Int = partitions

  /**
   * Maps a key to a partition index in `[0, numPartitions)`.
   *
   * @param key a `CompositeKey` (hashed by `stockSymbol` only), `null`
   *            (always partition 0), or any other object (hashed as a whole)
   */
  def getPartition(key: Any): Int = key match {
    case k: CompositeKey => math.abs(k.stockSymbol.hashCode % numPartitions)
    case null => 0
    case _ => math.abs(key.hashCode % numPartitions)
  }

  /** Two partitioners are equal when they have the same number of partitions. */
  override def equals(other: Any): Boolean = other match {
    case h: CompositeKeyPartitioner => h.numPartitions == numPartitions
    case _ => false
  }

  override def hashCode: Int = numPartitions
}