【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter3 Top 10 List

最新推荐文章于 2019-01-11 15:21:36 发布

baibaiw5

最新推荐文章于 2019-01-11 15:21:36 发布

阅读量1k

点赞数

spark 专栏收录该内容

28 篇文章 0 订阅

订阅专栏

：scala版本的Top 10 List

package com.bbw5.dataalgorithms.spark

import scala.collection.mutable.PriorityQueue

import org.apache.spark.Logging
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Assumption: for all input (K, V), K's are unique.
 * This means that there will not be entries like (A, 5) and (A, 8).
 *
 * This class implements Top-N design pattern for N > 0.
 * This class may be used to find bottom-N as well (by
 * just keeping N-smallest elements in the set).
 *
 *  Top-10 Design Pattern: “Top Ten” Structure
 *
 *    class mapper :
 *         setup(): initialize top ten sorted list
 *         map(key, record ):
 *                       Insert record into top ten sorted list; if length of list
 *                       is greater than 10,
 *                       truncate list to a length of 10.
 *         cleanup() : for record in top sorted ten list: emit null, record
 *
 *    class reducer:
 *               setup(): initialize top ten sorted list
 *               reduce(key, records): sort records
 *                                     truncate records to top 10
 *                                     for record in records: emit record
 * cat id,cat name,cat weight
 * 1,cat1,13
 * 2,cat2,10
 * 3,cat3,14
 * 4,cat4,13
 * 5,cat5,20
 * 6,cat6,24
 * 7,cat7,13
 * 8,cat8,10
 * 9,cat9,24
 * 10,cat10,13
 * 11,cat11,30
 * 12,cat12,14
 *
 * @author bbw5
 *
 */
object SparkTopNList extends Logging {
  // Input path and N used when no command-line arguments are supplied.
  private val DefaultInputPath = "D:/temp/data/top1.txt"
  private val DefaultTopN = 5

  /**
   * Ordering on (weight, name) pairs under which the pair with the SMALLEST
   * weight is the head of a `PriorityQueue`, so `dequeue()` always evicts the
   * current minimum. Built with `reverse` rather than by negating the weight,
   * because `-Int.MinValue` overflows back to `Int.MinValue`.
   */
  private val smallestWeightFirst: Ordering[(Int, String)] =
    Ordering.by[(Int, String), Int](_._1).reverse

  /**
   * Keeps the `n` pairs with the largest weights out of `items`.
   *
   * @param items (weight, name) pairs to scan; consumed exactly once
   * @param n     maximum number of pairs to retain
   * @return a queue holding at most `n` pairs; its iteration order is
   *         unspecified (only `dequeue` honours the ordering)
   */
  private def keepTopN(items: Iterator[(Int, String)], n: Int): PriorityQueue[(Int, String)] = {
    val queue = new PriorityQueue[(Int, String)]()(smallestWeightFirst)
    items.foreach { item =>
      queue += item
      // Once the queue grows past n, drop the smallest weight seen so far.
      if (queue.size > n) {
        queue.dequeue()
      }
    }
    queue
  }

  /**
   * Reads "id,name,weight" lines, computes the N heaviest entries and prints
   * them as (weight, name) tuples, heaviest first.
   *
   * @param args optional: args(0) = input path (defaults to
   *             "D:/temp/data/top1.txt"), args(1) = N (defaults to 5)
   */
  def main(args: Array[String]): Unit = {
    val filename = args.headOption.getOrElse(DefaultInputPath)
    val topN = args.lift(1).map(_.trim.toInt).getOrElse(DefaultTopN)
    require(topN > 0, s"topN must be positive, got $topN")

    val sparkConf = new SparkConf().setAppName("SparkTopNList")
    val sc = new SparkContext(sparkConf)
    try {
      // Blank lines carry no record; skipping them avoids an index error on split.
      val weightedNames = sc.textFile(filename)
        .filter(_.trim.nonEmpty)
        .map(_.split(","))
        .map(fields => (fields(2).trim.toInt, fields(1)))

      // First pass: reduce every partition to its own local top N, so at most
      // (numPartitions * topN) pairs are shipped to the driver. topN is a plain
      // Int captured by the closure, so no broadcast variable is needed.
      val topNRDD = weightedNames.mapPartitions(items => keepTopN(items, topN).iterator)

      // Second pass: merge the per-partition candidates into the global top N.
      val finalTopN = keepTopN(topNRDD.collect().iterator, topN)

      // Print for verification. Iterating a PriorityQueue does not follow
      // priority order, so sort explicitly: heaviest first.
      finalTopN.toList.sortBy(_._1)(Ordering[Int].reverse).foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}