// Scala version of the Top 10 List
package com.bbw5.dataalgorithms.spark
import scala.collection.mutable.PriorityQueue
import org.apache.spark.Logging
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
* Assumption: for all input (K, V), K's are unique.
* This means that there will not be entries like (A, 5) and (A, 8).
*
* This class implements Top-N design pattern for N > 0.
* This class may be used to find bottom-N as well (by
* just keeping N-smallest elements in the set).
*
* Top-10 Design Pattern: “Top Ten” Structure
*
* class mapper :
* setup(): initialize top ten sorted list
* map(key, record ):
* Insert record into top ten sorted list; if length of list
* is greater than 10,
* truncate list to a length of 10.
* cleanup() : for record in top sorted ten list: emit null, record
*
* class reducer:
* setup(): initialize top ten sorted list
* reduce(key, records): sort records
* truncate records to top 10
* for record in records: emit record
* cat id,cat name,cat weight
* 1,cat1,13
* 2,cat2,10
* 3,cat3,14
* 4,cat4,13
* 5,cat5,20
* 6,cat6,24
* 7,cat7,13
* 8,cat8,10
* 9,cat9,24
* 10,cat10,13
* 11,cat11,30
* 12,cat12,14
*
* @author bbw5
*
*/
object SparkTopNList extends Logging {

  /** Input file used when no path is given on the command line. */
  private val DefaultInput = "D:/temp/data/top1.txt"

  /** Number of records kept when no N is given on the command line. */
  private val DefaultTopN = 5

  /**
   * Keeps the `n` records with the largest weight out of `items`.
   *
   * The queue is ordered so that the record with the SMALLEST weight sits at
   * the head; whenever the queue grows beyond `n` that smallest record is
   * dropped, so at most `n` records are held at any time.
   *
   * @param items (weight, name) pairs to scan
   * @param n     maximum number of records to keep
   * @return a queue holding at most `n` records; its iteration order is NOT sorted
   */
  private def keepTopN(items: Iterator[(Int, String)], n: Int): PriorityQueue[(Int, String)] = {
    // Reverse the natural Int ordering instead of negating the weight:
    // negation overflows for Int.MinValue.
    val smallestFirst = Ordering.by[(Int, String), Int](_._1).reverse
    val queue = new PriorityQueue[(Int, String)]()(smallestFirst)
    items.foreach { item =>
      queue += item
      if (queue.size > n) {
        queue.dequeue()
      }
    }
    queue
  }

  /**
   * Reads "id,name,weight" lines from a text file and prints the N records
   * with the largest weight, heaviest first.
   *
   * @param args optional arguments: `args(0)` is the input path
   *             (defaults to `D:/temp/data/top1.txt`), `args(1)` is N (defaults to 5)
   */
  def main(args: Array[String]): Unit = {
    val filename = if (args.length > 0) args(0) else DefaultInput
    val topN = if (args.length > 1) args(1).toInt else DefaultTopN
    require(topN > 0, s"topN must be positive, got $topN")

    val sparkConf = new SparkConf().setAppName("SparkTopNList")
    val sc = new SparkContext(sparkConf)
    try {
      val bTopN = sc.broadcast(topN)
      val topNRDD = sc.textFile(filename)
        .filter(_.trim.nonEmpty) // tolerate blank lines in the input
        .map(_.split(","))
        .map(fields => (fields(2).trim.toInt, fields(1)))
        // First reduce every partition to its own top N ...
        .mapPartitions(items => keepTopN(items, bTopN.value).iterator)

      // ... then merge the per-partition candidates on the driver.
      val finalTopN = keepTopN(topNRDD.collect().iterator, topN)

      // Verification output. A PriorityQueue does not iterate in priority
      // order, so sort explicitly before printing.
      finalTopN.toList.sortBy(_._1)(Ordering[Int].reverse).foreach(println)
    } finally {
      // Always release the Spark context, even when the job fails.
      sc.stop()
    }
  }
}