1. MapReduce实现分组、排序
1.1分组
//在MapReduce 主类设置自定义分组Class
job.setGroupingComparatorClass(Class<? extends RawComparator> cls)
//实例
job.setGroupingComparatorClass(S1apEnbidGroupingComparator.class);
1.2 排序
//自定义二次排序策略
job.setSortComparatorClass(Class<? extends RawComparator> cls)
//实例
job.setSortComparatorClass(S1apEnbidSortComparator.class);
2.Java版Spark实现分组、排序
//分组
JavaPairRDD.groupByKey(reducenum);
key重写hashCode()和equals()
// Sort the values within each group; the grouping key itself is left unchanged.
groupByKey.mapToPair(new PairFunction<Tuple2<SparkLocatorCombinedKey,Iterable<String>>, SparkLocatorCombinedKey, Iterable<String>>() {
// Required because Spark serializes this function to ship it to executors.
private static final long serialVersionUID = 8988893168013930479L;
@Override
public Tuple2<SparkLocatorCombinedKey, Iterable<String>> call(
Tuple2<SparkLocatorCombinedKey, Iterable<String>> tuple2) throws Exception {
// Materialize the lazy Iterable into a List so it can be sorted in memory.
// NOTE(review): this assumes each group fits in one executor's memory — confirm for large groups.
List<String> list = copyIterator(tuple2._2.iterator());
Collections.sort(list, new SecondSortByTimeComparator());
// Re-emit the same key paired with its now-sorted values (List is an Iterable).
return new Tuple2<SparkLocatorCombinedKey, Iterable<String>>(tuple2._1, list);
}
});
/**
 * Copies the remaining elements of an Iterator into a newly allocated List,
 * preserving encounter order (the Iterable => Iterator => List helper).
 */
public static <T> List<T> copyIterator(Iterator<T> iter) {
    List<T> result = new ArrayList<T>();
    while (iter.hasNext()) {
        result.add(iter.next());
    }
    return result;
}
// List => Iterable: no conversion needed — List is a subtype of Iterable
//SecondSortByTimeComparator
3.scala版Spark实现分组、排序
//分组 与Java一样 也是hashCode()和equals()
rdd.groupByKey(reduceNum)
// Sort the values inside each group; mapValues keeps the key and partitioning intact.
rdd.mapValues(iterable => {
// Materializes the whole group before sorting.
// NOTE(review): assumes every group fits in executor memory — confirm for skewed keys.
iterable.toList.sortBy(sortRule)
})
//sortRule
/** Ordering key for a group's records: timestamp first, employee id as the tie-breaker. */
def sortRule(employee: Employee): (Long, String) = {
  val byTime = employee.getTimeStamp
  val byId   = employee.getEmployeeID
  (byTime, byId)
}