When the dataset is too large to sort in memory, the secondary sort can be implemented on the Spark framework instead. Sample input (secondarySort.txt), one <id><,><time><,><value> record per line:
x,2,9
y,2,5
x,1,3
y,1,7
y,3,1
x,3,6
z,1,4
z,2,8
z,3,7
z,4,0
p,4,7
p,1,9
p,6,0
p,7,3
package com.gao.mapreduceSpark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Spark/Scala solution to secondary sort
 *
 * @author Gaurav Bhardwaj (gauravbhardwajemail@gmail.com)
 */
object SecondarySort {
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println("Usage <number-of-partitions> <input-path> <output-path>")
      sys.exit(1)
    }
    val partitions = args(0).toInt
    val inputPath = args(1)
    val outputPath = args(2)

    val config = new SparkConf
    config.setAppName("SecondarySort")
    val sc = new SparkContext(config)
    val input = sc.textFile(inputPath)

    //------------------------------------------------
    // each input line/record has the following format:
    // <id><,><time><,><value>
    // map each record to a composite key: ("id-time" natural key, value)
    //-------------------------------------------------
    val valueToKey = input.map(x => {
      val line = x.split(",")
      ((line(0) + "-" + line(1), line(2).toInt), line(2).toInt)
    })

    // Sort the reducer keys by plugging a custom Ordering into the framework:
    // descending on the "id-time" string, then descending on the value.
    implicit val tupleOrderingDesc: Ordering[(String, Int)] = new Ordering[(String, Int)] {
      override def compare(x: (String, Int), y: (String, Int)): Int = {
        if (y._1.compare(x._1) == 0) y._2.compare(x._2)
        else y._1.compare(x._1)
      }
    }

    // Partition by the natural key and sort each partition with the Ordering above.
    val sorted = valueToKey.repartitionAndSortWithinPartitions(new CustomPartitioner(partitions))

    // Drop the secondary part of the key before writing the result.
    val result = sorted.map { case (k, v) => (k._1, v) }
    result.saveAsTextFile(outputPath)

    sc.stop()
  }
}
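The job above references a CustomPartitioner that the listing does not include. Below is a minimal sketch of one, under the assumption that it partitions on the natural key alone (the id portion of the "id-time" string, before the dash), so that all records for the same id land in the same partition:

package com.gao.mapreduceSpark

import org.apache.spark.Partitioner

// Minimal sketch (assumption: partition on the id before the dash,
// so every record for an id goes to the same partition).
class CustomPartitioner(partitions: Int) extends Partitioner {
  require(partitions > 0, "number of partitions must be positive")

  override def numPartitions: Int = partitions

  override def getPartition(key: Any): Int = key match {
    // The key is the composite ("id-time", value) tuple built in the map step.
    case (composite: String, _) =>
      val naturalKey = composite.split("-")(0)
      // Normalize so a negative hashCode cannot yield a negative partition index.
      (naturalKey.hashCode % numPartitions + numPartitions) % numPartitions
    case _ => 0
  }
}

Partitioning on the id rather than the full composite key is what makes this a secondary sort: the partitioner controls grouping, while the implicit Ordering controls the sort order within each partition.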
Output (descending by composite key):
(z-4,0)
(z-3,7)
(z-2,8)
(z-1,4)
(y-3,1)
(y-2,5)
(y-1,7)
(x-3,6)
(x-2,9)
(x-1,3)
(p-7,3)
(p-6,0)
(p-4,7)
(p-1,9)
Run:
./bin/spark-submit --master yarn --deploy-mode cluster --class com.gao.mapreduceSpark.SecondarySort /usr/soft/data/data_algorithms/SparkDemo4-1.0-SNAPSHOT.jar 2 hdfs://ns/mp/secondarySort.txt hdfs://ns/mp/result1
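To sanity-check the descending composite-key ordering on its own, the same Ordering can be exercised on a plain Scala collection; this is an illustrative sketch, not part of the job, and needs no Spark cluster:

object OrderingCheck {
  def main(args: Array[String]): Unit = {
    // Same comparator as in SecondarySort: descending on the "id-time"
    // string, then descending on the value.
    implicit val desc: Ordering[(String, Int)] = new Ordering[(String, Int)] {
      override def compare(x: (String, Int), y: (String, Int)): Int = {
        if (y._1.compare(x._1) == 0) y._2.compare(x._2)
        else y._1.compare(x._1)
      }
    }
    val keys = Seq(("x-1", 3), ("z-4", 0), ("y-2", 5), ("x-3", 6))
    // Prints: (z-4,0), (y-2,5), (x-3,6), (x-1,3)
    println(keys.sorted.mkString(", "))
  }
}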