spark分组求topN

最新推荐文章于 2023-07-04 20:07:29 发布

XLMN

最新推荐文章于 2023-07-04 20:07:29 发布

阅读量876

点赞数

分类专栏： Spark 文章标签： spark分组求topN

本文链接：https://blog.csdn.net/weixin_44701192/article/details/96628322

版权

Spark 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

在MR、Hive中都处理过的操作：分组的TopN

	比如有10个文件，每个文件都有100w个数字，要从中找出最大的10个数字。

    比如有很多部门，比如研发部、设计部、市场部、行政部等等，要求找出每个部门年龄最小的三个小姐姐。

	这就是分组TopN的问题。

/**
 * Grouped top-N implemented with `groupByKey`.
 *
 * Reads whitespace-separated "course name score" lines, gathers all records of
 * a course with `groupByKey`, and prints the `TopN` lowest-scoring
 * "name|score" records of every course.
 */
object _03SparkGroupTopNOps {
    /** Number of records kept for every course. */
    private val TopN = 3

    /**
     * Orders "name|score" records by ascending score; records with equal scores
     * are ordered by their full text.
     *
     * The tie-break keeps the ordering lawful (`compare(x, x) == 0`), so
     * removal from the sorted set works and the result does not depend on the
     * order in which records arrive. As a consequence, byte-identical records
     * are stored only once. A non-numeric score raises `NumberFormatException`.
     */
    private val ByScore: Ordering[String] = new Ordering[String] {
        // The score is everything after the last '|', so a '|' inside a name is harmless.
        private def scoreOf(info: String): Int =
            info.substring(info.lastIndexOf('|') + 1).toInt

        override def compare(x: String, y: String): Int = {
            val byScore = Integer.compare(scoreOf(x), scoreOf(y))
            if (byScore != 0) byScore else x.compareTo(y)
        }
    }

    /**
     * Runs the job on a local two-thread master and prints one line per course.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
            .setAppName(_03SparkGroupTopNOps.getClass.getSimpleName)
            .setMaster("local[2]")
        val sc = new SparkContext(conf)

        // Stop the context even when the job fails.
        try {
            val lines = sc.textFile("file:///E:/data/spark/topn.txt")

            // (course, "name|score")
            val course2Info: RDD[(String, String)] = lines.map { line =>
                val fields = line.split("\\s+")
                val course = fields(0)
                val name = fields(1)
                val score = fields(2)
                (course, s"$name|$score")
            }

            // All records of a course have to be brought together before they can be ranked.
            val course2Infos: RDD[(String, Iterable[String])] = course2Info.groupByKey()

            /*
                Ranking:
                   key   - the course
                   value - every "name|score" record of that course
                Each key still maps to exactly one value (hence map); the value
                merely shrinks to at most TopN records.
             */
            val top3: RDD[(String, mutable.TreeSet[String])] = course2Infos.map { case (course, infos) =>
                val kept = mutable.TreeSet.empty[String](ByScore)
                for (info <- infos) {
                    kept += info
                    // NOTE(review): the largest score is evicted, so the LOWEST TopN
                    // scores survive, exactly as in the original; reverse ByScore if
                    // the highest scores are wanted.
                    if (kept.size > TopN) {
                        kept -= kept.last
                    }
                }
                (course, kept)
            }

            top3.foreach { case (course, infos) =>
                println(s"$course---->$infos")
            }
            /*
            Sample output recorded in the original write-up. It lists six entries
            per course, so it appears to predate the truncation to three:
            english---->TreeSet(ww|56, ys|67, mz|77, ts|87, zq|88, gk|96)
            chinese---->TreeSet(sj|74, zl|76, zs|90, ls|91, wb|95, yj|98)
             */
        } finally {
            sc.stop()
        }
    }
}


升级：

	因为groupByKey的性能太差了，所以需要使用combineByKey来模拟，怎么做？

第一步，将groupByKey的处理方式，转化为combineByKey

```scala
/**
 * Grouped top-N, step 1: `groupByKey` re-expressed with `combineByKey`.
 *
 * The combiner simply collects every record of a course into an
 * `ArrayBuffer`; ranking and truncation to `TopN` happen afterwards in a
 * `map`, so nothing is reduced before the shuffle yet.
 */
object _04SparkGroupTopNOps {
    /** Number of records kept for every course. */
    private val TopN = 3

    /**
     * Orders "name|score" records by ascending score; records with equal scores
     * are ordered by their full text.
     *
     * The tie-break keeps the ordering lawful (`compare(x, x) == 0`), so
     * removal from the sorted set works and the result does not depend on the
     * order in which records arrive. As a consequence, byte-identical records
     * are stored only once. A non-numeric score raises `NumberFormatException`.
     */
    private val ByScore: Ordering[String] = new Ordering[String] {
        // The score is everything after the last '|', so a '|' inside a name is harmless.
        private def scoreOf(info: String): Int =
            info.substring(info.lastIndexOf('|') + 1).toInt

        override def compare(x: String, y: String): Int = {
            val byScore = Integer.compare(scoreOf(x), scoreOf(y))
            if (byScore != 0) byScore else x.compareTo(y)
        }
    }

    /**
     * Runs the job on a local two-thread master and prints one line per course.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
            .setAppName(s"${_04SparkGroupTopNOps.getClass.getSimpleName}")
            .setMaster("local[2]")
        val sc = new SparkContext(conf)

        // Stop the context even when the job fails.
        try {
            val lines = sc.textFile("file:///E:/data/spark/topn.txt")

            // (course, "name|score")
            val course2Info: RDD[(String, String)] = lines.map { line =>
                val fields = line.split("\\s+")
                val course = fields(0)
                val name = fields(1)
                val score = fields(2)
                (course, s"$name|$score")
            }

            // All records of a course have to be brought together before they can be ranked.
            val course2Infos: RDD[(String, ArrayBuffer[String])] = course2Info
                .combineByKey(createCombiner, mergeValue, mergeCombiners)

            val top3: RDD[(String, mutable.TreeSet[String])] = course2Infos.map { case (course, infos) =>
                val kept = mutable.TreeSet.empty[String](ByScore)
                for (info <- infos) {
                    kept += info
                    // NOTE(review): the largest score is evicted, so the LOWEST TopN
                    // scores survive, exactly as in the original; reverse ByScore if
                    // the highest scores are wanted.
                    if (kept.size > TopN) {
                        kept -= kept.last
                    }
                }
                (course, kept)
            }

            top3.foreach { case (course, infos) =>
                println(s"$course---->$infos")
            }
            /*
            Sample output recorded in the original write-up. It lists six entries
            per course, so it appears to predate the truncation to three:
            english---->TreeSet(ww|56, ys|67, mz|77, ts|87, zq|88, gk|96)
            chinese---->TreeSet(sj|74, zl|76, zs|90, ls|91, wb|95, yj|98)
             */
        } finally {
            sc.stop()
        }
    }

    /**
     * Starts the buffer for a course from the first record seen for it.
     *
     * @param info a "name|score" record
     * @return a new buffer containing only `info`
     */
    def createCombiner(info: String): ArrayBuffer[String] = {
        val ab = ArrayBuffer[String]()
        ab.append(info)
        ab
    }

    /**
     * Appends one more record of the same course to the buffer.
     *
     * @param ab   buffer accumulated so far; it is modified in place
     * @param info the next "name|score" record
     * @return `ab` itself
     */
    def mergeValue(ab: ArrayBuffer[String], info: String): ArrayBuffer[String] = {
        ab.append(info)
        ab
    }

    /**
     * Concatenates two buffers built for the same course.
     *
     * @param ab1 first buffer
     * @param ab2 second buffer
     * @return a new buffer holding the records of `ab1` followed by those of `ab2`
     */
    def mergeCombiners(ab1: ArrayBuffer[String], ab2: ArrayBuffer[String]): ArrayBuffer[String] = {
        ab1 ++ ab2
    }
}

```

第二步：

第一步和普通的groupByKey并没有什么两样，性能依然很差，因为没有本地预聚合。所以在重写的时候，先在每个分区内做一次本地的top3，这样最后做分区间的top3的时候，每个分区最多提供3条记录，在网络中传输的数据量会少很多，性能得到了提升。

/**
 * Grouped top-N, step 2: `combineByKey` with a bounded, sorted combiner.
 *
 * Every combiner is a sorted set that is trimmed to `TopN` records after each
 * insertion and after each merge, so no combiner handed to the merge step
 * holds more than `TopN` records.
 */
object _05SparkGroupTopNOps {
    /** Number of records kept for every course. */
    private val TopN = 3

    /**
     * Orders "name|score" records by ascending score; records with equal scores
     * are ordered by their full text.
     *
     * The tie-break keeps the ordering lawful (`compare(x, x) == 0`), so
     * removal from the sorted set works and merging does not depend on the
     * order in which records arrive. As a consequence, byte-identical records
     * are stored only once. A non-numeric score raises `NumberFormatException`.
     */
    private val ByScore: Ordering[String] = new Ordering[String] {
        // The score is everything after the last '|', so a '|' inside a name is harmless.
        private def scoreOf(info: String): Int =
            info.substring(info.lastIndexOf('|') + 1).toInt

        override def compare(x: String, y: String): Int = {
            val byScore = Integer.compare(scoreOf(x), scoreOf(y))
            if (byScore != 0) byScore else x.compareTo(y)
        }
    }

    /**
     * Runs the job on a local two-thread master and prints one line per course.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
            .setAppName(s"${_05SparkGroupTopNOps.getClass.getSimpleName}")
            .setMaster("local[2]")
        val sc = new SparkContext(conf)

        // Stop the context even when the job fails.
        try {
            val lines = sc.textFile("file:///E:/data/spark/topn.txt")

            // (course, "name|score")
            val course2Info: RDD[(String, String)] = lines.map { line =>
                val fields = line.split("\\s+")
                val course = fields(0)
                val name = fields(1)
                val score = fields(2)
                (course, s"$name|$score")
            }

            // Ranking and truncation both happen inside the combiner functions.
            val course2Infos: RDD[(String, mutable.TreeSet[String])] = course2Info
                .combineByKey(createCombiner, mergeValue, mergeCombiners)

            course2Infos.foreach { case (course, infos) =>
                println(s"$course---->$infos")
            }
            /*
            Sample output recorded in the original write-up. It lists six entries
            per course, so it appears to predate the truncation to three:
            english---->TreeSet(ww|56, ys|67, mz|77, ts|87, zq|88, gk|96)
            chinese---->TreeSet(sj|74, zl|76, zs|90, ls|91, wb|95, yj|98)
             */
        } finally {
            sc.stop()
        }
    }

    /**
     * Starts the sorted set for a course from the first record seen for it.
     *
     * @param info a "name|score" record
     * @return a new set, ordered by score, containing only `info`
     */
    def createCombiner(info: String): mutable.TreeSet[String] = {
        val ab = mutable.TreeSet.empty[String](ByScore)
        ab += info
        ab
    }

    /**
     * Adds one record to the set, then trims the set back to `TopN` records.
     *
     * @param ab   set accumulated so far; it is modified in place
     * @param info the next "name|score" record
     * @return `ab`, holding at most `TopN` records
     */
    def mergeValue(ab: mutable.TreeSet[String], info: String): mutable.TreeSet[String] = {
        ab += info
        keepTopN(ab)
    }

    /**
     * Merges two sets built for the same course and trims the result to `TopN` records.
     *
     * @param ab1 first set; it receives the records of `ab2` and is modified in place
     * @param ab2 second set; left untouched
     * @return `ab1`, holding at most `TopN` records
     */
    def mergeCombiners(ab1: mutable.TreeSet[String], ab2: mutable.TreeSet[String]): mutable.TreeSet[String] = {
        ab1 ++= ab2
        keepTopN(ab1)
    }

    /**
     * Evicts the largest records until at most `TopN` remain.
     *
     * NOTE(review): the LOWEST scores survive, exactly as with the original
     * `take(3)` on an ascending set; reverse ByScore if the highest are wanted.
     */
    private def keepTopN(set: mutable.TreeSet[String]): mutable.TreeSet[String] = {
        while (set.size > TopN) {
            set -= set.last
        }
        set
    }
}

XLMN

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
spark分组求topN

在MR、Hive中都处理过的操作：分组的TopN。比如有10个文件，每个文件都有100w个数字，要从中找出最大的10个数字。比如有很多部门，比如研发部、设计部、市场部、行政部等等，要求找出每个部门年龄最小的三个小姐姐。这就是分组TopN的问题。object _03SparkGroupTopNOps {def main(args: Array[String]): Uni...
复制链接

扫一扫