排序是大数据的核心精华。
快排和归并排序更是 MapReduce 中的精华部分。
以下是归并排序部分。
一、百度百科:
归并排序(MERGE-SORT)是建立在归并操作上的一种有效的排序算法,该算法是采用分治法(Divide and Conquer)的一个非常典型的应用。将已有序的子序列合并,得到完全有序的序列;即先使每个子序列有序,再使子序列段间有序。若将两个有序表合并成一个有序表,称为二路归并。归并排序是一种稳定的排序方法。
二、通俗解释:
三、Show you the code:
四、MapReduce中的归并排序源码
package org.apache.hadoop.util;
import java.util.Comparator;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.IntWritable;
/**
 * An implementation of the core algorithm of MergeSort.
 *
 * <p>The sort works on {@code int} values. Their order is decided solely by the
 * supplied {@link Comparator}, which receives each pair of values wrapped in the
 * two reusable {@link IntWritable} instances {@code I} and {@code J}.
 *
 * <p>{@code I} and {@code J} are mutable instance fields that are overwritten
 * before every comparison, so one instance must not be used to sort on several
 * threads at the same time.
 */
@InterfaceAudience.LimitedPrivate({"MapReduce"})
@InterfaceStability.Unstable
public class MergeSort {
  /** Ranges shorter than this are sorted in place with insertion sort. */
  private static final int INSERTION_SORT_THRESHOLD = 7;

  // Reusable IntWritables: holders for the two values handed to the comparator
  IntWritable I = new IntWritable(0);
  IntWritable J = new IntWritable(0);

  // The comparator that the algorithm should use
  private final Comparator<IntWritable> comparator;

  /**
   * Creates a sorter that orders values with the given comparator.
   *
   * @param comparator ordering applied to the values (wrapped in {@link IntWritable})
   */
  public MergeSort(Comparator<IntWritable> comparator) {
    this.comparator = comparator;
  }

  /**
   * Sorts the range {@code [low, high)} of {@code dest} according to the comparator.
   *
   * <p>{@code src} serves as the working buffer: both halves of the range are
   * sorted into {@code src} by recursive calls with the array roles swapped, and
   * are then merged back into {@code dest}. For {@code dest} to end up as a sorted
   * permutation of the input, both arrays must hold the same values in
   * {@code [low, high)} on entry. The contents of {@code src} in that range may be
   * reordered.
   *
   * <p>Values that compare as equal keep their relative order: the insertion sort
   * only swaps on a strictly greater result and the merge prefers the left half
   * on ties.
   *
   * @param src  working buffer; expected to mirror {@code dest} in the range on entry
   * @param dest array that receives the sorted range
   * @param low  index of the first element to sort (inclusive)
   * @param high index just past the last element to sort (exclusive)
   */
  public void mergeSort(int[] src, int[] dest, int low, int high) {
    int length = high - low;

    // Insertion sort on smallest arrays
    if (length < INSERTION_SORT_THRESHOLD) {
      for (int i = low; i < high; i++) {
        for (int j = i; j > low; j--) {
          I.set(dest[j - 1]);
          J.set(dest[j]);
          if (comparator.compare(I, J) <= 0) {
            // dest[low..j-1] is already sorted, so the element has reached its
            // final position; further comparisons could never trigger a swap.
            break;
          }
          swap(dest, j, j - 1);
        }
      }
      return;
    }

    // Recursively sort halves of dest into src (note the swapped array roles).
    // The unsigned shift keeps the midpoint correct even if low + high overflows.
    int mid = (low + high) >>> 1;
    mergeSort(dest, src, low, mid);
    mergeSort(dest, src, mid, high);

    // If list is already sorted, just copy from src to dest. This is an
    // optimization that results in faster sorts for nearly ordered lists.
    I.set(src[mid - 1]);
    J.set(src[mid]);
    if (comparator.compare(I, J) <= 0) {
      System.arraycopy(src, low, dest, low, length);
      return;
    }

    // Merge sorted halves (now in src) into dest
    for (int i = low, p = low, q = mid; i < high; i++) {
      boolean takeLeft;
      if (q >= high) {
        // Right half exhausted: only left elements remain.
        takeLeft = true;
      } else if (p >= mid) {
        // Left half exhausted: only right elements remain.
        takeLeft = false;
      } else {
        I.set(src[p]);
        J.set(src[q]);
        // "<=" sends ties to the left half, which keeps the merge stable.
        takeLeft = comparator.compare(I, J) <= 0;
      }
      dest[i] = takeLeft ? src[p++] : src[q++];
    }
  }

  /** Exchanges the elements at positions {@code a} and {@code b} of {@code x}. */
  private void swap(int[] x, int a, int b) {
    int t = x[a];
    x[a] = x[b];
    x[b] = t;
  }
}