package com.gao.mapreduceSpark

import org.apache.spark.{SparkConf, SparkContext}

object SecondarySortByMemory {
  def main(args: Array[String]): Unit = {
    // Validate the command-line arguments
    if (args.length != 2) {
      println("Usage: SecondarySortByMemory <input-path> <output-path>")
      sys.exit(1)
    }
    val inputPath = args(0)
    val outputPath = args(1)

    val config = new SparkConf()
    config.setAppName("SecondarySortByMemory")
    val sc = new SparkContext(config)

    // Each input line has the form: name,time,value
    val input = sc.textFile(inputPath)
    val result = input.map { line =>
      val infos = line.split(",")
      val name = infos(0)
      val time = infos(1)
      val value = infos(2)
      // key by name; (time, value) is the part we want sorted
      (name, (time, value))
    }.sortBy(_._2)  // sort all records by (time, value)
      .groupByKey() // then group the sorted records by name

    result.saveAsTextFile(outputPath)
    sc.stop()
  }
}
Submit the job as follows:
./bin/spark-submit --master yarn --deploy-mode cluster --class com.gao.mapreduceSpark.SecondarySortByMemory /usr/soft/data/data_algorithms/SparkDemo4-1.0-SNAPSHOT.jar hdfs://ns/mp/secondarySort.txt hdfs://ns/mp/result
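Before submitting to YARN, the same pipeline can be sanity-checked in a local spark-shell. The sample records and the printed line below are illustrative assumptions, not output from the cluster run:

// spark-shell --master local[*]
val sample = sc.parallelize(Seq("x,2,9", "x,1,3", "y,2,5", "y,1,7"))
val grouped = sample.map { line =>
  val Array(name, time, value) = line.split(",")
  (name, (time, value))
}.sortBy(_._2).groupByKey()
grouped.collect().foreach(println)
// prints lines such as (x,CompactBuffer((1,3), (2,9)))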
Input file (columns: name, time, value):
x,2,9
y,2,5
x,1,3
y,1,7
y,3,1
x,3,6
z,1,4
z,2,8
z,3,7
z,4,0
p,4,7
p,1,9
p,6,0
p,7,3
Result:
(z,CompactBuffer((1,4), (2,8), (3,7), (4,0)))
(p,CompactBuffer((1,9), (4,7), (6,0), (7,3)))
(x,CompactBuffer((1,3), (2,9), (3,6)))
(y,CompactBuffer((1,7), (2,5), (3,1)))
Implementation approach:
Build tuples of the form (name, (time, value)).
First sortBy(_._2) to sort all records by the (time, value) tuple.
Then groupByKey() to collect the sorted records for each name.
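Two caveats apply to this approach: groupByKey() does not guarantee the order of values within a group, and because time is kept as a String the sort is lexicographic ("10" sorts before "2"). A more defensive in-memory variant, sketched below under a hypothetical object name, parses time as an Int and sorts each group's values after grouping rather than before:

package com.gao.mapreduceSpark

import org.apache.spark.{SparkConf, SparkContext}

object SecondarySortInMemoryGroups {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SecondarySortInMemoryGroups"))
    val result = sc.textFile(args(0))
      .map { line =>
        val Array(name, time, value) = line.split(",")
        // keep time and value numeric so the ordering is by number, not by string
        (name, (time.toInt, value.toInt))
      }
      .groupByKey()
      // sort each group's (time, value) pairs in memory on the executor
      .mapValues(_.toList.sortBy(_._1))
    result.saveAsTextFile(args(1))
    sc.stop()
  }
}

The trade-off is that every group must fit in executor memory, which is why this style of secondary sort is described as being done "by memory".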