一、MapReduce/Hadoop的二次排序解决方案(点击打开)
二、Spark的二次排序解决方案
方案1:同(一)的方案1,将一个给定键的所有值读取并缓存到一个集合(如List或数组)中,然后在内存中对这些值进行排序。如果某个键的值太多、内存放不下,则该方案无法使用
方案2:同(一)的方案2,“会为自然键增加部分或整个值来创建一个组合键以实现排序目标”
三、代码实现(一):使用Spark Java API 中的groupByKey实现方案1(Java原始语法、lambda语法)
package ercipaixu_spark1;
// STEP-0: import required Java/Spark classes.
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
//
import scala.Tuple2;
//
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.Function2;
//
/**
* SecondarySortUsingCombineByKey class implements the secondary sort design pattern
* by using combineByKey().
*
*
* Input:
*
* name, time, value
* x,2,9
* y,2,5
* x,1,3
* y,1,7
* y,3,1
* x,3,6
* z,1,4
* z,2,8
* z,3,7
* z,4,0
* p,1,10
* p,3,60
* p,4,40
* p,6,20
*
* Output: generate a time-series looking like this:
*
* t1 t2 t3 t4 t5 t6
* x => [3, 9, 6]
* y => [7, 5, 1]
* z => [4, 8, 7, 0]
* p => [10, null, 60, 40, null , 20]
*
* x => [(1,3), (2,9), (3,6)] where 1 < 2 < 3
* y => [(1,7), (2,5), (3,1)] where 1 < 2 < 3
* z => [(1,4), (2,8), (3,7), (4,0)] where 1 < 2 < 3 < 4
* p => [(1,10), (3,60), (4,40), (6,20)] where 1 < 3 < 4 < 6
*
* @author Mahmoud Parsian
*
*/
public class SecondarySortUsingCombineByKey {

    /** Sample input path used when no paths are supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "/media/chenjie/0009418200012FF3/ubuntu/sample_input.txt";

    /** Sample output directory used when no paths are supplied on the command line. */
    private static final String DEFAULT_OUTPUT_PATH =
            "/media/chenjie/0009418200012FF3/ubuntu/sample_output";

    /**
     * Reads "name,time,value" records, groups them by name with combineByKey()
     * into a per-name SortedMap keyed by time, prints the intermediate results
     * and saves the combined RDD as text.
     *
     * @param args optional {@code <input> <output>} paths; when fewer than two
     *             arguments are given the built-in sample paths are used
     * @throws Exception if any Spark step fails
     */
    public static void main(String[] args) throws Exception {
        // STEP-1: read input parameters; fall back to the sample paths when absent.
        // (Previously args was overwritten unconditionally, which silently ignored
        // the command line and made the usage check unreachable.)
        final String inputPath;
        final String outputPath;
        if (args.length >= 2) {
            inputPath = args[0];
            outputPath = args[1];
        } else {
            System.err.println("Usage: SecondarySortUsingCombineByKey <input> <output>");
            System.err.println("Fewer than two arguments given; using the built-in sample paths.");
            inputPath = DEFAULT_INPUT_PATH;
            outputPath = DEFAULT_OUTPUT_PATH;
        }
        System.out.println("inputPath=" + inputPath);
        System.out.println("outputPath=" + outputPath);

        // STEP-2: connect to the Spark master by creating a JavaSparkContext object
        final JavaSparkContext ctx = SparkUtil.createJavaSparkContext("local","spark1");
        try {
            // STEP-3: use ctx to create JavaRDD<String>
            // input record format: <name><,><time><,><value>
            JavaRDD<String> lines = ctx.textFile(inputPath, 1);

            // STEP-4: create (key, value) pairs from JavaRDD<String> where
            // key is the {name} and value is a pair of (time, value).
            // PairFunction<T, K, V>: T => Tuple2(K, V)
            // where K = String and V = Tuple2<Integer, Integer>
            System.out.println("=== DEBUG STEP-4 ===");
            JavaPairRDD<String, Tuple2<Integer, Integer>> pairs = lines.mapToPair(
                    new PairFunction<String, String, Tuple2<Integer, Integer>>() {
                @Override
                public Tuple2<String, Tuple2<Integer, Integer>> call(String s) {
                    String[] tokens = s.split(","); // x,2,5
                    System.out.println(tokens[0] + "," + tokens[1] + "," + tokens[2]);
                    Tuple2<Integer, Integer> timevalue = new Tuple2<Integer, Integer>(
                            Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]));
                    return new Tuple2<String, Tuple2<Integer, Integer>>(tokens[0], timevalue);
                }
            });

            // STEP-5: validate STEP-4, we collect all values from JavaPairRDD<> and print them.
            List<Tuple2<String, Tuple2<Integer, Integer>>> output = pairs.collect();
            for (Tuple2<String, Tuple2<Integer, Integer>> t : output) {
                Tuple2<Integer, Integer> timevalue = t._2;
                // print name,time,value (the value column used to repeat the time)
                System.out.println(t._1 + "," + timevalue._1 + "," + timevalue._2);
            }

            // How to use combineByKey(): define 3 basic functions f1, f2, f3
            // and invoke it as combineByKey(f1, f2, f3):
            //   function 1: create a combiner data structure
            //   function 2: merge a value into a combined data structure
            //   function 3: merge two combiner data structures

            // function 1: create a combiner data structure.
            // Here, the combiner data structure is a SortedMap<Integer, Integer>,
            // which keeps track of (time, value) for a given key.
            //   Tuple2<Integer, Integer>    = Tuple2<time, value>
            //   SortedMap<Integer, Integer> = SortedMap<time, value>
            Function<Tuple2<Integer, Integer>, SortedMap<Integer, Integer>> createCombiner
                    = new Function<Tuple2<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(Tuple2<Integer, Integer> x) {
                    SortedMap<Integer, Integer> map = new TreeMap<>();
                    map.put(x._1, x._2);
                    return map;
                }
            };

            // function 2: merge a value into a combined data structure.
            // A later value for an already-present time replaces the earlier one.
            Function2<SortedMap<Integer, Integer>, Tuple2<Integer, Integer>, SortedMap<Integer, Integer>> mergeValue
                    = new Function2<SortedMap<Integer, Integer>, Tuple2<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(SortedMap<Integer, Integer> map, Tuple2<Integer, Integer> x) {
                    map.put(x._1, x._2);
                    return map;
                }
            };

            // function 3: merge two combiner data structures
            Function2<SortedMap<Integer, Integer>, SortedMap<Integer, Integer>, SortedMap<Integer, Integer>> mergeCombiners
                    = new Function2<SortedMap<Integer, Integer>, SortedMap<Integer, Integer>, SortedMap<Integer, Integer>>() {
                @Override
                public SortedMap<Integer, Integer> call(SortedMap<Integer, Integer> map1, SortedMap<Integer, Integer> map2) {
                    // NOTE(review): this used to be an if/else on map1.size() < map2.size()
                    // whose two branches made this exact same call. If DataStructures.merge
                    // expects (smaller, larger), the else branch was probably meant to pass
                    // (map2, map1) -- confirm against DataStructures before changing.
                    return DataStructures.merge(map1, map2);
                }
            };

            // STEP-6: create sorted (time, value) per name
            JavaPairRDD<String, SortedMap<Integer, Integer>> combined = pairs.combineByKey(
                    createCombiner,
                    mergeValue,
                    mergeCombiners);

            // STEP-7: validate STEP-6, we collect all values from JavaPairRDD<> and print them.
            System.out.println("=== DEBUG STEP-6 ===");
            List<Tuple2<String, SortedMap<Integer, Integer>>> output2 = combined.collect();
            for (Tuple2<String, SortedMap<Integer, Integer>> t : output2) {
                System.out.println(t._1);
                System.out.println(t._2);
            }

            // persist output
            combined.saveAsTextFile(outputPath);
        } finally {
            // always release the Spark context, even when a step above fails
            ctx.close();
        }
        // exit
        System.exit(0);
    }
}
其中用到的比较器类如下:因为我们是按时间进行比较,所以只比较(time, value)二元组的第一个分量
package ercipaixu_spark1;
import scala.Tuple2;
import java.util.Comparator;
import java.io.Serializable;
/**
 * The SparkTupleComparator class enables us to compare two
 * Tuple2<Integer, Integer> objects based on the first Tuple2
 * component only; the second component is ignored.
 *
 * The class is a serializable singleton: use {@link #INSTANCE}.
 *
 * @author Mahmoud Parsian
 *
 */
public class SparkTupleComparator
        implements Comparator<Tuple2<Integer, Integer>>, Serializable {

    // Explicit version id so the serialized form does not depend on compiler-generated values.
    private static final long serialVersionUID = 1L;

    /** The single shared instance. */
    public static final SparkTupleComparator INSTANCE = new SparkTupleComparator();

    // Not instantiable from outside; use INSTANCE.
    private SparkTupleComparator() {
    }

    /**
     * Compares two tuples by their first component.
     *
     * @param t1 the first tuple
     * @param t2 the second tuple
     * @return a negative, zero or positive value as t1._1 is less than,
     *         equal to or greater than t2._1
     */
    @Override
    public int compare(Tuple2<Integer, Integer> t1, Tuple2<Integer, Integer> t2) {
        return t1._1.compareTo(t2._1);
    }

    /**
     * Keeps the singleton property across Java deserialization by resolving
     * every deserialized copy back to {@link #INSTANCE}.
     */
    private Object readResolve() {
        return INSTANCE;
    }
}
四、代码实现(二):使用Spark Java API 中的combineByKey实现方案1(Java原始语法、lambda语法)
五、代码实现(三):使用Spark Java API 中的repartitionAndSortWithinPartitions实现方案1
六、代码实现(四):使用Scala语言实现方案2
import org.apache.spark.Partitioner
/**
 * Custom partitioner for composite keys of the form (String, Int).
 *
 * For a (String, Int) key only the String component is hashed, so keys that
 * share the same first component land in the same partition regardless of the
 * Int component. A null key goes to partition 0; any other key is placed by
 * its own hashCode.
 *
 * @param partitions number of partitions; must be greater than zero
 */
class CustomPartitioner(partitions: Int) extends Partitioner {
  // The check rejects 0 as well as negatives, so the message says "positive".
  require(partitions > 0, s"Number of partitions ($partitions) must be positive.")

  override def numPartitions: Int = partitions

  /** Maps a key to a partition index in [0, numPartitions). */
  override def getPartition(key: Any): Int = key match {
    // abs of the remainder is safe: the remainder's magnitude is below numPartitions
    case (k: String, _: Int) => math.abs(k.hashCode % numPartitions)
    case null => 0
    case _ => math.abs(key.hashCode % numPartitions)
  }

  /** Two CustomPartitioners are equal when they have the same number of partitions. */
  override def equals(other: Any): Boolean = other match {
    case h: CustomPartitioner => h.numPartitions == numPartitions
    case _ => false
  }

  override def hashCode: Int = numPartitions
}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object SecondarySort {

  /**
   * Reads "name,time,value" records, builds the composite key
   * ("name-time", value), sorts the records within each partition by that key
   * in descending order and writes ("name-time", value) pairs as text.
   *
   * @param args optional: args(0) is the input path and args(1) the output
   *             path; the sample paths are used for whichever is missing
   */
  def main(args: Array[String]): Unit = {
    val partitions = 1 // number of partitions
    // input / output paths (sample locations unless overridden on the command line)
    val inputPath =
      args.lift(0).getOrElse("file:///media/chenjie/0009418200012FF3/ubuntu/sample_input.txt")
    val outputPath =
      args.lift(1).getOrElse("file:///media/chenjie/0009418200012FF3/ubuntu/sample_output")

    val conf = new SparkConf().setAppName("CJResult").setMaster("local") // run Spark locally
    val sc = new SparkContext(conf)
    try {
      val input = sc.textFile(inputPath) // read the input file

      // Map each record to (("name-time", value), value): the key is itself a pair.
      val valueToKey = input.map { record =>
        val fields = record.split(",")
        val value = fields(2).toInt // parsed once, used in both key and value
        ((s"${fields(0)}-${fields(1)}", value), value)
      }

      // Implicit ordering (not a conversion) for the (String, Int) composite key:
      // descending by the String component, ties broken by the Int component,
      // also descending. It is picked up by repartitionAndSortWithinPartitions.
      // NOTE(review): the String component is "name-time", so time takes part in
      // a plain string comparison (e.g. "x-10" vs "x-2") -- confirm this is intended.
      implicit val tupleOrderingDesc: Ordering[(String, Int)] = new Ordering[(String, Int)] {
        override def compare(x: (String, Int), y: (String, Int)): Int = {
          val byFirst = y._1.compare(x._1)
          if (byFirst == 0) y._2.compare(x._2) else byFirst
        }
      }

      val sorted = valueToKey.repartitionAndSortWithinPartitions(new CustomPartitioner(partitions))

      // Turn (("name-time", value), value) into ("name-time", value).
      val result = sorted.map {
        case (k, v) => (k._1, v)
      }
      result.saveAsTextFile(outputPath) // write the result
    } finally {
      // always stop the context, even when the job fails
      sc.stop()
    }
  }
}