Spark 二次排序:文件数据较大的情况

数据量较大时,不能把全部数据加载到内存中排序,可以借助 Spark 框架自身的排序机制(repartitionAndSortWithinPartitions)来实现二次排序。

示例输入(每行格式为 name,time,value):

x,2,9
y,2,5
x,1,3
y,1,7
y,3,1
x,3,6
z,1,4
z,2,8
z,3,7
z,4,0
p,4,7
p,1,9
p,6,0
p,7,3
package com.gao.mapreduceSpark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Spark/Scala solution to secondary sort.
 *
 * Reads comma-separated records of the form `<id>,<time>,<value>`, turns each
 * record into the pair `(("<id>-<time>", value), value)`, and delegates the
 * sorting to Spark's shuffle through `repartitionAndSortWithinPartitions`.
 * Within each output partition the records appear in descending order of the
 * composite key (first by `"<id>-<time>"`, then by `value`).
 *
 * Usage: `SecondarySort <number-of-partitions> <input-path> <output-path>`
 *
 * @author Gaurav Bhardwaj (gauravbhardwajemail@gmail.com)
 */
object SecondarySort {

  /**
   * Job entry point.
   *
   * @param args exactly three values: the number of partitions, the input
   *             path, and the output path. Any other count prints the usage
   *             line and exits with status 1.
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println("Usage <number-of-partitions> <input-path> <output-path>")
      sys.exit(1)
    }

    val partitions = args(0).toInt
    val inputPath = args(1)
    val outputPath = args(2)

    val config = new SparkConf().setAppName("SecondarySort")
    val sc = new SparkContext(config)

    // Stop the context even when the job fails, so the application does not
    // keep holding cluster resources after an exception.
    try {
      // Ordering for the composite key used by the shuffle: descending by the
      // natural key ("<id>-<time>"), ties broken by descending value.
      // Declared as a typed `implicit val` so a single instance is reused.
      implicit val compositeKeyDescending: Ordering[(String, Int)] =
        new Ordering[(String, Int)] {
          override def compare(x: (String, Int), y: (String, Int)): Int = {
            val byNaturalKey = y._1.compare(x._1)
            if (byNaturalKey != 0) byNaturalKey else y._2.compare(x._2)
          }
        }

      //------------------------------------------------
      // each input line/record has the following format:
      // <id><,><time><,><value>
      // map step: the natural key is "<id>-<time>"; the value is also moved
      // into the key so the framework can sort on it.
      //-------------------------------------------------
      val valueToKey = sc.textFile(inputPath)
        // Blank lines carry no record; skip them instead of failing the job.
        .filter(_.trim.nonEmpty)
        .map { record =>
          val fields = record.split(",")
          // Fail with the offending record in the message rather than a bare
          // ArrayIndexOutOfBoundsException.
          require(
            fields.length >= 3,
            s"Malformed record (expected <id>,<time>,<value>): '$record'"
          )
          val value = fields(2).toInt // parsed once, used in key and value
          ((fields(0) + "-" + fields(1), value), value)
        }

      // Sort the reducer keys with the framework's pluggable sort: records are
      // routed by CustomPartitioner (defined elsewhere in the project) and
      // sorted inside each partition with the implicit ordering above.
      val sorted =
        valueToKey.repartitionAndSortWithinPartitions(new CustomPartitioner(partitions))

      // Drop the value from the composite key, leaving ("<id>-<time>", value).
      val result = sorted.map { case ((naturalKey, _), value) => (naturalKey, value) }

      result.saveAsTextFile(outputPath)
    } finally {
      // done
      sc.stop()
    }
  }
}
输出结果:

(z-4,0)
(z-3,7)
(z-2,8)
(z-1,4)
(y-3,1)
(y-2,5)
(y-1,7)
(x-3,6)
(x-2,9)
(x-1,3)
(p-7,3)
(p-6,0)
(p-4,7)
(p-1,9)

执行:

./bin/spark-submit --master yarn --deploy-mode cluster --class com.gao.mapreduceSpark.SecondarySort  /usr/soft/data/data_algorithms/SparkDemo4-1.0-SNAPSHOT.jar 2 hdfs://ns/mp/secondarySort.txt hdfs://ns/mp/result1

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值