关于Timsort原理部分可以看一下
今天分析一下java的timsort相关的实现。
java中的Arrays工具类的静态方法sort():
/**
 * Sorts the range {@code [fromIndex, toIndex)} of {@code a}.
 *
 * <p>The range is validated first; the legacy merge sort is used only when it
 * has been explicitly requested, otherwise the work is handed to
 * {@code ComparableTimSort} (the TimSort implementation discussed here).
 *
 * @param a         the array to sort
 * @param fromIndex first index of the range (inclusive)
 * @param toIndex   last index of the range (exclusive)
 */
public static void sort(Object[] a, int fromIndex, int toIndex) {
    // Bounds check first. Per the original notes this may raise
    // IllegalArgumentException (fromIndex > toIndex) or
    // ArrayIndexOutOfBoundsException.
    rangeCheck(a.length, fromIndex, toIndex);

    if (!LegacyMergeSort.userRequested) {
        // Normal path: TimSort without a caller-supplied work array.
        ComparableTimSort.sort(a, fromIndex, toIndex, null, 0, 0);
        return;
    }

    // The legacy merge sort takes priority when requested (not the focus here).
    legacyMergeSort(a, fromIndex, toIndex);
}
ComparableTimSort直接继承Object类,并有如下的域和方法:
直接看sort方法:
/**
 * Sorts the range {@code [lo, hi)} of {@code a}.
 *
 * <p>Ranges shorter than two elements are returned untouched. Ranges shorter
 * than {@code MIN_MERGE} (declared as 32 according to the original notes) are
 * handled by a binary insertion sort seeded with the initial run. Longer
 * ranges are processed run by run: each run is found (and extended to
 * {@code minRun} elements when it is too short), pushed on the run stack, and
 * the stack is collapsed by merging.
 *
 * @param a        the array to sort
 * @param lo       first index of the range (inclusive)
 * @param hi       last index of the range (exclusive)
 * @param work     workspace array passed through to the sorter instance
 * @param workBase start of the usable workspace
 * @param workLen  usable length of the workspace
 */
static void sort(Object[] a, int lo, int hi, Object[] work, int workBase, int workLen) {
    // Sanity-check the arguments.
    assert a != null && lo >= 0 && lo <= hi && hi <= a.length;

    int remaining = hi - lo;
    if (remaining < 2) {
        return; // Arrays of size 0 and 1 are always sorted
    }

    if (remaining < MIN_MERGE) {
        // Small range: no merging. countRunAndMakeAscending leaves an
        // ascending run starting at a[lo] (a strictly descending run is
        // turned into an ascending one) and returns its length; binarySort
        // then inserts the rest of the range into that sorted prefix.
        int sortedPrefix = countRunAndMakeAscending(a, lo, hi);
        binarySort(a, lo, hi, lo + sortedPrefix);
        return;
    }

    // The real TimSort starts here: walk the array once, left to right,
    // finding natural runs, extending short ones to minRun elements, and
    // merging runs to maintain the stack invariant.
    ComparableTimSort sorter = new ComparableTimSort(a, work, workBase, workLen);
    int minRun = minRunLength(remaining);

    int cursor = lo;
    // At least MIN_MERGE elements remain on entry, so the body always runs
    // at least once.
    while (remaining != 0) {
        // Identify the next ascending (or strictly descending) run.
        int len = countRunAndMakeAscending(a, cursor, hi);

        // Too short: grow it to min(remaining, minRun) elements with a
        // binary insertion sort.
        if (len < minRun) {
            int extended = Math.min(remaining, minRun);
            binarySort(a, cursor, cursor + extended, cursor + len);
            len = extended;
        }

        // Record the run (base and length) on the pending-run stack, then
        // merge runs so that stacked run lengths keep shrinking roughly
        // geometrically, which keeps adjacent merges balanced.
        sorter.pushRun(cursor, len);
        sorter.mergeCollapse();

        // Advance to find the next run.
        cursor += len;
        remaining -= len;
    }

    // Merge all remaining runs to complete the sort.
    assert cursor == hi;
    sorter.mergeForceCollapse();
    // Exactly one run must be left once everything is merged.
    assert sorter.stackSize == 1;
}
下面看一下合并run的操作
/**
 * Examines the stack of pending runs and merges adjacent runs until the
 * stack invariants hold again:
 *
 * <pre>
 *   1. runLen[i - 3] > runLen[i - 2] + runLen[i - 1]
 *   2. runLen[i - 2] > runLen[i - 1]
 * </pre>
 *
 * Keeping these invariants makes the stacked run lengths shrink roughly
 * geometrically, so adjacent runs have similar sizes and merges are cheap.
 *
 * <p>Checking only the top three runs is not sufficient: a merge can leave
 * invariant 1 violated one level deeper in the stack, after which the stack
 * may grow larger than expected (reported against the JDK as an
 * ArrayIndexOutOfBoundsException on specially constructed inputs). The
 * condition below therefore also inspects the run beneath the top three.
 */
private void mergeCollapse() {
    while (stackSize > 1) {
        int n = stackSize - 2;
        if (n > 0 && runLen[n - 1] <= runLen[n] + runLen[n + 1]
                || n > 1 && runLen[n - 2] <= runLen[n] + runLen[n - 1]) {
            // Invariant 1 is broken for the top three runs or for the three
            // just below: merge the middle run with its smaller neighbour.
            if (runLen[n - 1] < runLen[n + 1]) {
                n--;
            }
        } else if (n < 0 || runLen[n] > runLen[n + 1]) {
            break; // Invariant is established
        }
        mergeAt(n);
    }
}
其中调用了mergeAt函数
/**
 * Merges the two adjacent runs at stack positions {@code i} and {@code i + 1}.
 * {@code i} must be the second-to-last or third-to-last run on the stack.
 *
 * @param i stack index of the first of the two runs to merge
 */
private void mergeAt(int i) {
    assert stackSize >= 2;
    assert i >= 0;
    assert i == stackSize - 2 || i == stackSize - 3;

    final int next = i + 1;
    int leftBase = runBase[i];
    int leftLen = runLen[i];
    final int rightBase = runBase[next];
    int rightLen = runLen[next];
    assert leftLen > 0 && rightLen > 0;
    assert leftBase + leftLen == rightBase;

    // Record the combined length in slot i. When i is the third-to-last run,
    // the last run (not part of this merge) slides down one slot. Either way
    // the run at i + 1 disappears from the stack.
    runLen[i] = leftLen + rightLen;
    if (i == stackSize - 3) {
        runBase[next] = runBase[next + 1];
        runLen[next] = runLen[next + 1];
    }
    stackSize--;

    // Locate where the first element of the right run belongs in the left
    // run; everything before that point is already in place and is skipped.
    int alreadyPlaced = gallopRight((Comparable<Object>) a[rightBase], a, leftBase, leftLen, 0);
    assert alreadyPlaced >= 0;
    leftBase += alreadyPlaced;
    leftLen -= alreadyPlaced;
    if (leftLen == 0) {
        return;
    }

    // Locate where the last element of the left run belongs in the right
    // run; everything after that point is already in place and is dropped.
    rightLen = gallopLeft((Comparable<Object>) a[leftBase + leftLen - 1], a,
            rightBase, rightLen, rightLen - 1);
    assert rightLen >= 0;
    if (rightLen == 0) {
        return;
    }

    // Merge what is left, using a tmp array with min(leftLen, rightLen) elements.
    if (leftLen > rightLen) {
        mergeHi(leftBase, leftLen, rightBase, rightLen);
    } else {
        mergeLo(leftBase, leftLen, rightBase, rightLen);
    }
}
其中在合并之前使用了gallop(飞奔)模式来查找插入位置:它不是插入排序,而是先按指数步长跳跃、再做二分查找的定位方法(这部分的代码明天再看)。
该查找分为gallopLeft和gallopRight两个方法;
下面的函数定位应该将key插入到数组a中从base开始,长度为len的子数组中的哪个位置;如果数组中存在相等的值,则返回最左的那个index(翻译源码注释),hint表示在a[base+hint]位置附近开始搜索key;
算法思路如下 :(其实整理为一个流程图更好)
1.初始化变量lastOfs=0、ofs=1;判断key是否大于a[base+hint],大于的话跳转至2(这一步就可以确定接下来的搜索范围是(base+hint, base+len),即hint的右侧),否则跳转至4(这一步就可以确定接下来的搜索范围是[base, base+hint],即hint及其左侧)
2.在ofs小于上界maxOfs(=len-hint)的前提下,判断key是否大于a[base+hint+ofs],若大于则
lastOfs=ofs;
ofs=(ofs<<1)+1;//指数变化(若发生int溢出则令ofs=maxOfs)
并跳转至2;若key小于等于该元素(或ofs已达到上界)则跳转至3
3.进行赋值
lastOfs += hint;
ofs += hint;
此时可以确定这样的一个关系,a[base+lastOfs] < key <= a[base+ofs];跳转至6;
4.在ofs小于上界maxOfs(=hint+1)的前提下,判断key是否小于等于a[base+hint-ofs],若小于等于则
lastOfs = ofs;
ofs = (ofs << 1) + 1;
并跳转至4;若key大于该元素(或ofs已达到上界)则跳转至5
5.进行赋值(向左搜索时偏移量的方向与下标相反,因此需要借助临时变量tmp交换两个边界)
int tmp = lastOfs;
lastOfs = hint - ofs;
ofs = hint - tmp;
此时可以确定这样的一个关系,a[base+lastOfs] < key <= a[base+ofs];跳转至6;
6.利用二分查找寻找key插入的值;
下面的是源码
/**
 * Finds the position at which {@code key} should be inserted into the sorted
 * range of {@code a} that starts at {@code base} and spans {@code len}
 * elements. If the range contains elements equal to {@code key}, the index
 * of the leftmost one is returned.
 *
 * @param key  the value whose insertion point is searched for
 * @param a    the array holding the sorted range
 * @param base index of the first element of the range
 * @param len  length of the range; must be {@code > 0}
 * @param hint offset (relative to {@code base}) at which the search starts;
 *             must satisfy {@code 0 <= hint < len}
 * @return the offset {@code k} relative to {@code base}, {@code 0 <= k <= len},
 *         such that {@code a[base + k - 1] < key <= a[base + k]}
 */
private static int gallopLeft(Comparable<Object> key, Object[] a,
                              int base, int len, int hint) {
    assert len > 0 && hint >= 0 && hint < len;

    final int start = base + hint; // absolute index where galloping begins
    int prevStep = 0;
    int step = 1;
    int low;  // exclusive lower bound of the bracket, relative to base
    int high; // inclusive upper bound of the bracket, relative to base

    if (key.compareTo(a[start]) > 0) {
        // Gallop right until a[start + prevStep] < key <= a[start + step].
        final int limit = len - hint;
        while (step < limit && key.compareTo(a[start + step]) > 0) {
            prevStep = step;
            step = (step << 1) + 1;
            if (step <= 0) { // int overflow
                step = limit;
            }
        }
        if (step > limit) {
            step = limit;
        }

        // Make the offsets relative to base.
        low = hint + prevStep;
        high = hint + step;
    } else { // key <= a[start]
        // Gallop left until a[start - step] < key <= a[start - prevStep].
        final int limit = hint + 1;
        while (step < limit && key.compareTo(a[start - step]) <= 0) {
            prevStep = step;
            step = (step << 1) + 1;
            if (step <= 0) { // int overflow
                step = limit;
            }
        }
        if (step > limit) {
            step = limit;
        }

        // Make the offsets relative to base; moving left flips their order.
        low = hint - step;
        high = hint - prevStep;
    }
    assert -1 <= low && low < high && high <= len;

    // Now a[base + low] < key <= a[base + high], so key belongs somewhere to
    // the right of low but no farther right than high. Binary search with
    // invariant a[base + low - 1] < key <= a[base + high].
    low++;
    while (low < high) {
        int mid = low + ((high - low) >>> 1);
        if (key.compareTo(a[base + mid]) > 0) {
            low = mid + 1; // a[base + mid] < key
        } else {
            high = mid;    // key <= a[base + mid]
        }
    }
    assert low == high; // so a[base + high - 1] < key <= a[base + high]
    return high;
}