一、内存排序
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Spark job that computes a per-symbol moving average of stock prices, sorting each
 * symbol's records in memory after grouping them by symbol.
 *
 * Input lines look like `GOOG,2004-11-04,184.70` (symbol, `yyyy-MM-dd` date, price).
 * Output lines look like `IBM,2013-09-26,189.845` (symbol, date, average of the prices in
 * the window ending on that date). Until `window` prices have been seen for a symbol, the
 * average covers only the prices seen so far, e.g. for IBM with a window of 3:
 *
 * {{{
 * IBM,2013-09-25,189.47
 * IBM,2013-09-26,189.845
 * IBM,2013-09-27,188.87
 * IBM,2013-09-30,187.43999999999997
 * }}}
 */
object MovingAverageInMemory {

  private val DefaultInput = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1.txt"
  private val DefaultOutput = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1"
  private val DefaultWindow = 3

  /**
   * Runs the job.
   *
   * @param args optional positional overrides: `[input] [output] [window]`. Any argument
   *             that is missing falls back to the built-in default.
   * @throws IllegalArgumentException if a window argument is given but is not a positive integer
   */
  def main(args: Array[String]): Unit = {
    val input = args.lift(0).getOrElse(DefaultInput)
    val output = args.lift(1).getOrElse(DefaultOutput)
    val window = args.lift(2).map(parseWindow).getOrElse(DefaultWindow)

    val sparkConf = new SparkConf()
      .setAppName("MovingAverageInMemory")
      // Default to local mode, but do not override a master that was supplied externally.
      .setIfMissing("spark.master", "local")
    val sc = new SparkContext(sparkConf)

    // Stop the context even when a stage fails, so the application does not leak it.
    try {
      val broadcastWindow = sc.broadcast(window)

      // "GOOG,2004-11-04,184.70" -> (GOOG, (2004-11-04, 184.7))
      val keyValue = sc.textFile(input)
        // Blank lines would otherwise fail with an index error once split.
        .filter(_.trim.nonEmpty)
        .map { line =>
          val tokens = line.split(",")
          (tokens(0), (tokens(1), tokens(2).toDouble))
        }

      // (IBM, [(2013-09-30,185.18), (2013-09-27,186.92), ...]) -> (IBM, [(2013-09-25,189.47), ...])
      val result = keyValue.groupByKey().mapValues { values =>
        movingAverage(values, broadcastWindow.value)
      }

      // (IBM, [(2013-09-25,189.47), ...]) -> "IBM,2013-09-25,189.47", ...
      val formattedResult = result.flatMap { case (symbol, averages) =>
        averages.map { case (date, average) => s"$symbol,$date,$average" }
      }

      formattedResult.saveAsTextFile(output)
    } finally {
      sc.stop()
    }
  }

  /** Parses the window-size argument, rejecting anything that is not a positive integer. */
  private def parseWindow(raw: String): Int =
    scala.util.Try(raw.trim.toInt).toOption.filter(_ > 0).getOrElse(
      throw new IllegalArgumentException(s"window must be a positive integer, got '$raw'")
    )

  /**
   * Sorts one symbol's `(date, price)` records by date and pairs each date with the average
   * of the most recent `window` prices up to and including that date.
   *
   * @param values `(yyyy-MM-dd date, price)` records of a single symbol, in any order
   * @param window maximum number of prices contributing to each average
   * @return `(date, average)` pairs in ascending date order
   */
  private def movingAverage(
      values: Iterable[(String, Double)],
      window: Int): Seq[(String, Double)] = {
    // SimpleDateFormat is not thread-safe, so every call works with its own instance.
    val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
    val sortedValues = values.toVector
      .map { case (date, price) => (dateFormat.parse(date).getTime, price) }
      .sortBy(_._1)

    // Holds at most `window` of the most recently visited prices.
    val recent = scala.collection.mutable.Queue.empty[Double]
    sortedValues.map { case (timestamp, price) =>
      recent.enqueue(price)
      if (recent.size > window) recent.dequeue()
      (dateFormat.format(new java.util.Date(timestamp)), recent.sum / recent.size)
    }
  }
}
二、
自定义分区器CompositeKeyPartitioner
import MovingAverage.CompositeKey
import org.apache.spark.Partitioner

/**
 * Partitioner that places a [[CompositeKey]] according to its `stockSymbol` only, so every
 * composite key carrying the same symbol maps to the same partition index.
 *
 * Keys of any other type are placed by their own hash code; a `null` key goes to partition 0.
 *
 * @param partitions number of partitions; must not be negative
 */
class CompositeKeyPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

  /** Number of partitions this partitioner distributes keys over. */
  def numPartitions: Int = partitions

  /** Returns the partition index for `key`. */
  def getPartition(key: Any): Int =
    if (key == null) {
      0
    } else {
      // Only the stock symbol of a composite key takes part in the placement.
      val hash = key match {
        case composite: CompositeKey => composite.stockSymbol.hashCode
        case other => other.hashCode
      }
      math.abs(hash % numPartitions)
    }

  /** Two partitioners of this class are equal when they have the same number of partitions. */
  override def equals(other: Any): Boolean = other match {
    case that: CompositeKeyPartitioner => numPartitions == that.numPartitions
    case _ => false
  }

  /** Consistent with `equals`: derived from the partition count alone. */
  override def hashCode: Int = numPartitions
}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

// NOTE(review): this listing is identical to the MovingAverageInMemory listing that appears
// earlier in the file, although it sits under the heading that introduces
// CompositeKeyPartitioner. Presumably a partitioner-based variant was intended here - confirm.

/**
 * Spark job that computes a per-symbol moving average of stock prices, sorting each
 * symbol's records in memory after grouping them by symbol.
 *
 * Input lines look like `GOOG,2004-11-04,184.70` (symbol, `yyyy-MM-dd` date, price).
 * Output lines look like `IBM,2013-09-26,189.845` (symbol, date, average of the prices in
 * the window ending on that date). Until `window` prices have been seen for a symbol, the
 * average covers only the prices seen so far, e.g. for IBM with a window of 3:
 *
 * {{{
 * IBM,2013-09-25,189.47
 * IBM,2013-09-26,189.845
 * IBM,2013-09-27,188.87
 * IBM,2013-09-30,187.43999999999997
 * }}}
 */
object MovingAverageInMemory {

  private val DefaultInput = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1.txt"
  private val DefaultOutput = "file:///media/chenjie/0009418200012FF3/ubuntu/gupiao1"
  private val DefaultWindow = 3

  /**
   * Runs the job.
   *
   * @param args optional positional overrides: `[input] [output] [window]`. Any argument
   *             that is missing falls back to the built-in default.
   * @throws IllegalArgumentException if a window argument is given but is not a positive integer
   */
  def main(args: Array[String]): Unit = {
    val input = args.lift(0).getOrElse(DefaultInput)
    val output = args.lift(1).getOrElse(DefaultOutput)
    val window = args.lift(2).map(parseWindow).getOrElse(DefaultWindow)

    val sparkConf = new SparkConf()
      .setAppName("MovingAverageInMemory")
      // Default to local mode, but do not override a master that was supplied externally.
      .setIfMissing("spark.master", "local")
    val sc = new SparkContext(sparkConf)

    // Stop the context even when a stage fails, so the application does not leak it.
    try {
      val broadcastWindow = sc.broadcast(window)

      // "GOOG,2004-11-04,184.70" -> (GOOG, (2004-11-04, 184.7))
      val keyValue = sc.textFile(input)
        // Blank lines would otherwise fail with an index error once split.
        .filter(_.trim.nonEmpty)
        .map { line =>
          val tokens = line.split(",")
          (tokens(0), (tokens(1), tokens(2).toDouble))
        }

      // (IBM, [(2013-09-30,185.18), (2013-09-27,186.92), ...]) -> (IBM, [(2013-09-25,189.47), ...])
      val result = keyValue.groupByKey().mapValues { values =>
        movingAverage(values, broadcastWindow.value)
      }

      // (IBM, [(2013-09-25,189.47), ...]) -> "IBM,2013-09-25,189.47", ...
      val formattedResult = result.flatMap { case (symbol, averages) =>
        averages.map { case (date, average) => s"$symbol,$date,$average" }
      }

      formattedResult.saveAsTextFile(output)
    } finally {
      sc.stop()
    }
  }

  /** Parses the window-size argument, rejecting anything that is not a positive integer. */
  private def parseWindow(raw: String): Int =
    scala.util.Try(raw.trim.toInt).toOption.filter(_ > 0).getOrElse(
      throw new IllegalArgumentException(s"window must be a positive integer, got '$raw'")
    )

  /**
   * Sorts one symbol's `(date, price)` records by date and pairs each date with the average
   * of the most recent `window` prices up to and including that date.
   *
   * @param values `(yyyy-MM-dd date, price)` records of a single symbol, in any order
   * @param window maximum number of prices contributing to each average
   * @return `(date, average)` pairs in ascending date order
   */
  private def movingAverage(
      values: Iterable[(String, Double)],
      window: Int): Seq[(String, Double)] = {
    // SimpleDateFormat is not thread-safe, so every call works with its own instance.
    val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd")
    val sortedValues = values.toVector
      .map { case (date, price) => (dateFormat.parse(date).getTime, price) }
      .sortBy(_._1)

    // Holds at most `window` of the most recently visited prices.
    val recent = scala.collection.mutable.Queue.empty[Double]
    sortedValues.map { case (timestamp, price) =>
      recent.enqueue(price)
      if (recent.size > window) recent.dequeue()
      (dateFormat.format(new java.util.Date(timestamp)), recent.sum / recent.size)
    }
  }
}