类似于“后缀数组的自底向上(bottom-up)遍历算法”,也可以对Enhanced Suffix Array执行自顶向下(top-down)遍历。这里通过增强信息——lcptable和childtab——来自顶向下遍历后缀数组。每一个后缀的childtab元素包含up、down和nextLIndex三项。up或down值指向某个lcp-interval的第一个l-index;nextLIndex指向下一个l-index,这样所有的孩子区间都能在childtab的基础上得出:设某个lcp-interval[i..j]的l-indices为i1<i2<...<ik,那么第一个孩子区间为[i..i1-1],其他的孩子区间为[i1..i2-1], ..., [ik..j];并且childtab[i].down和childtab[j+1].up两个值之间至少有一个值等于第一个l-index,其他的l-index可以通过childtab[i1].nextLIndex依次得出。计算childtab的算法参考“Replacing suffix trees with enhanced suffix arrays”。
实现:
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
/**
*
* Top-down Traversal of a suffix array (with lcptable and childtab)
* (The suffix array is constructed with DC3 algorithm)
*
*
* Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
* Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
*
* @author ljs
* 2011-07-25
*
*/
public class ESA_TopDownTraversal {

    /**
     * One child-table entry. Each field holds an index into the suffix
     * array, or -1 when the entry has no such link.
     */
    public class LIndexInfo {
        private int up = -1;
        private int down = -1;
        private int nextLIndex = -1;

        @Override
        public String toString() {
            return up + ":" + down + ":" + nextLIndex;
        }
    }

    private final String text;
    private int[] lcptable;
    private final LIndexInfo[] childtab;
    private final int len;

    /**
     * Creates a traversal helper for {@code text}.
     *
     * @param text the text to index; {@link #solve()} expects its last
     *             character to be smaller than every other character
     * @throws NullPointerException if {@code text} is null
     */
    public ESA_TopDownTraversal(String text) {
        this.text = text;
        this.len = text.length();
        this.childtab = new LIndexInfo[len];
        for (int i = 0; i < len; i++) {
            childtab[i] = new LIndexInfo();
        }
    }

    /**
     * Historical upper bound of the supported alphabet. Kept for
     * compatibility; the digit range used for sorting is now derived
     * from the characters actually present in the text.
     */
    public static final char MAX_CHAR = '\u00FF';

    /** A suffix array together with its rank array. */
    class Suffix {
        int[] sa;
        // rank[p] is the 1-based position of the p-th suffix in sa,
        // i.e. sa[rank[p] - 1] == p; p starts at 0.
        int[] rank;
        // true when every sorted key was distinct, so sa is already final
        boolean done;

        public Suffix(int[] sa, int[] rank) {
            this.sa = sa;
            this.rank = rank;
        }
    }

    /** A prefix of suffix {@code isuffix}, represented as a digit sequence. */
    class Tuple {
        int isuffix; // the p-th suffix
        int[] digits;

        public Tuple(int suffix, int[] digits) {
            this.isuffix = suffix;
            this.digits = digits;
        }

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append(isuffix);
            sb.append("(");
            for (int i = 0; i < digits.length; i++) {
                sb.append(digits[i]);
                if (i < digits.length - 1) {
                    sb.append("-");
                }
            }
            sb.append(")");
            return sb.toString();
        }
    }

    /**
     * Stable counting sort of {@code tA} into {@code tB} on digit {@code d}.
     *
     * @param d   index of the digit to sort on
     * @param tA  input tuples
     * @param tB  output array, same length as {@code tA}
     * @param max largest value any digit can take (digits are in 0..max)
     */
    private void countingSort(int d, Tuple[] tA, Tuple[] tB, int max) {
        int[] count = new int[max + 1]; // zero-initialised by the JVM
        for (int j = 0; j < tA.length; j++) {
            count[tA[j].digits[d]]++;
        }
        // turn counts into end positions
        for (int i = 1; i <= max; i++) {
            count[i] += count[i - 1];
        }
        // walk backwards so that equal keys keep their relative order
        for (int j = tA.length - 1; j >= 0; j--) {
            tB[--count[tA[j].digits[d]]] = tA[j];
        }
    }

    /**
     * LSD radix sort on the last {@code digitsLen} digits of each tuple.
     * The sorted order ends up in {@code tB}; {@code tA} is overwritten
     * with intermediate passes.
     */
    private void radixSort(Tuple[] tA, Tuple[] tB, int max, int digitsLen) {
        int count = tA.length;
        int digitsTotalLen = tA[0].digits.length;
        for (int d = digitsTotalLen - 1, pass = 0; pass < digitsLen; d--, pass++) {
            countingSort(d, tA, tB, max);
            // feed the result into the next pass (not needed after the last one)
            if (pass < digitsLen - 1) {
                System.arraycopy(tB, 0, tA, 0, count);
            }
        }
    }

    /**
     * Sorts the tuples and assigns 1-based ranks; equal tuples share a rank.
     *
     * @param tA        input tuples (overwritten)
     * @param tB        scratch/output array of the same length
     * @param max       largest value of any digit, used by counting sort
     * @param digitsLen number of trailing digits that form the sort key
     * @return the sorted order and a rank array of length {@code tA.length + 2}
     *         whose two extra trailing slots are set to 1 as sentinels;
     *         {@code done} is set when all ranks are distinct
     */
    private Suffix rank(Tuple[] tA, Tuple[] tB, int max, int digitsLen) {
        int count = tA.length;
        radixSort(tA, tB, max, digitsLen);
        int digitsTotalLen = tA[0].digits.length;

        int[] sa = new int[count];
        int[] rank = new int[count + 2]; // two extra slots for the sentinels
        rank[count] = 1;
        rank[count + 1] = 1;

        int r = 1; // ranks start at 1
        sa[0] = tB[0].isuffix;
        rank[tB[0].isuffix] = r;
        for (int i = 1; i < count; i++) {
            sa[i] = tB[i].isuffix;
            boolean equalLast = true;
            for (int j = digitsTotalLen - digitsLen; j < digitsTotalLen; j++) {
                if (tB[i].digits[j] != tB[i - 1].digits[j]) {
                    equalLast = false;
                    break;
                }
            }
            if (!equalLast) {
                r++;
            }
            rank[tB[i].isuffix] = r;
        }

        Suffix suffix = new Suffix(sa, rank);
        suffix.done = (r == count);
        return suffix;
    }

    /** Sorts the tuples and returns only the resulting order of suffix ids. */
    private int[] orderSuffixes(Tuple[] tA, Tuple[] tB, int max, int digitsLen) {
        int count = tA.length;
        radixSort(tA, tB, max, digitsLen);
        int[] sa = new int[count];
        for (int i = 0; i < count; i++) {
            sa[i] = tB[i].isuffix;
        }
        return sa;
    }

    /**
     * Builds the reduced problem: one triple of ranks for every position
     * p with p % 3 == 1, followed by one for every p with p % 3 == 2,
     * then sorts and ranks those triples.
     *
     * @param rank rank array including its two sentinel slots
     */
    private Suffix reduce(int[] rank, int max) {
        int n = rank.length - 2;
        int n1 = (n + 1) / 3;
        int n2 = n / 3;
        Tuple[] tA = new Tuple[n1 + n2];
        Tuple[] tB = new Tuple[n1 + n2];
        for (int i = 0, j = 1; i < n1; i++, j += 3) {
            tA[i] = new Tuple(i, new int[]{rank[j], rank[j + 1], rank[j + 2]});
        }
        for (int i = n1, j = 2; i < n1 + n2; i++, j += 3) {
            tA[i] = new Tuple(i, new int[]{rank[j], rank[j + 1], rank[j + 2]});
        }
        return rank(tA, tB, max, 3);
    }

    /**
     * Recursive step of the suffix array construction.
     *
     * @param rank per-position ranks plus two sentinel slots
     * @param max  largest rank value in {@code rank}
     * @return the suffix array of the sequence described by {@code rank}
     */
    private int[] skew(int[] rank, int max) {
        int n = rank.length - 2;

        // step 1: order the suffixes starting at positions p % 3 != 0
        Suffix reduced = reduce(rank, max);
        int[] sa12;
        if (!reduced.done) {
            int[] reducedRank = reduced.rank;
            int reducedMax = reducedRank[reduced.sa[reduced.sa.length - 1]];
            sa12 = skew(reducedRank, reducedMax);
        } else {
            sa12 = reduced.sa;
        }

        // map indices of the reduced problem back to text positions
        int n1 = (n + 1) / 3;
        for (int j = 0; j < sa12.length; j++) {
            if (sa12[j] < n1) {
                sa12[j] = 1 + 3 * sa12[j];
            } else {
                sa12[j] = 2 + 3 * (sa12[j] - n1);
            }
        }

        // ranks of the p % 3 != 0 suffixes among themselves
        int[] rank12 = new int[n + 2];
        rank12[n] = 1;
        rank12[n + 1] = 1;
        for (int k = 0; k < sa12.length; k++) {
            rank12[sa12[k]] = k + 1;
        }

        // step 2: order the p % 3 == 0 suffixes by (own rank, rank12 of p+1)
        int n0 = (n + 2) / 3;
        Tuple[] tA = new Tuple[n0];
        Tuple[] tB = new Tuple[n0];
        for (int i = 0, j = 0; i < n0; i++, j += 3) {
            tA[i] = new Tuple(i, new int[]{rank[j], rank12[j + 1]});
        }
        int max12 = rank12[sa12[sa12.length - 1]];
        int[] sa0 = orderSuffixes(tA, tB, Math.max(max, max12), 2);
        for (int j = 0; j < n0; j++) {
            sa0[j] = 3 * sa0[j];
        }

        // step 3: merge sa12 and sa0
        int[] sa = new int[n];
        int i = 0;
        int j = 0;
        int k = 0;
        while (i < sa12.length && j < sa0.length) {
            int p = sa12[i];
            int q = sa0[j];
            boolean takeP;
            if (rank[p] != rank[q]) {
                takeP = rank[p] < rank[q];
            } else if (p % 3 == 1) {
                // p+1 and q+1 are both p % 3 != 0 positions
                takeP = rank12[p + 1] < rank12[q + 1];
            } else if (rank[p + 1] != rank[q + 1]) {
                takeP = rank[p + 1] < rank[q + 1];
            } else {
                // p+2 and q+2 are both p % 3 != 0 positions
                takeP = rank12[p + 2] < rank12[q + 2];
            }
            if (takeP) {
                sa[k++] = p;
                i++;
            } else {
                sa[k++] = q;
                j++;
            }
        }
        while (i < sa12.length) {
            sa[k++] = sa12[i++];
        }
        while (j < sa0.length) {
            sa[k++] = sa0[j++];
        }
        return sa;
    }

    /**
     * Builds the suffix array of {@code text}.
     *
     * Precondition: the last char in text must be less than other chars.
     *
     * @return the suffix array and a 1-based rank array of length
     *         {@code text.length()}, or null for a null or empty text
     */
    private Suffix DC3(String text) {
        if (text == null) {
            return null;
        }
        int n = text.length();
        if (n == 0) {
            return null;
        }

        // Use the real character range of the text as the digit range, so
        // counting sort never indexes outside its counter array.
        char minChar = text.charAt(0);
        char maxChar = minChar;
        for (int i = 1; i < n; i++) {
            char c = text.charAt(i);
            if (c < minChar) {
                minChar = c;
            }
            if (c > maxChar) {
                maxChar = c;
            }
        }

        Tuple[] tA = new Tuple[n];
        Tuple[] tB = new Tuple[n]; // placeholder
        for (int i = 0; i < n; i++) {
            tA[i] = new Tuple(i, new int[]{0, text.charAt(i) - minChar});
        }
        Suffix first = rank(tA, tB, maxChar - minChar, 1);

        int[] sa;
        if (first.done) {
            // all characters are distinct: sorting by first character is
            // already the suffix order (this also covers length-1 texts,
            // for which the recursive step has nothing to work on)
            sa = first.sa;
        } else {
            int max = first.rank[first.sa[n - 1]];
            sa = skew(first.rank, max);
        }

        // rank of each suffix in the final suffix array (1-based)
        int[] r = new int[n];
        for (int k = 0; k < sa.length; k++) {
            r[sa[k]] = k + 1;
        }
        return new Suffix(sa, r);
    }

    /**
     * Fills the child table (up, down, nextLIndex) from the LCP table set
     * by the last {@link #topDownTraverse(int[])} call. All entries are
     * reset first, so repeated calls do not see stale values.
     */
    public void buildChildtab() {
        for (int i = 0; i < len; i++) {
            LIndexInfo entry = childtab[i];
            entry.up = -1;
            entry.down = -1;
            entry.nextLIndex = -1;
        }
        if (len == 0) {
            return;
        }

        // array-backed stack of suffix-array indices
        int[] stack = new int[len];
        int top = 0;
        stack[0] = 0;

        // step 1: calculate up and down values
        int lastIndex = -1;
        for (int i = 1; i < len; i++) {
            while (lcptable[i] < lcptable[stack[top]]) {
                lastIndex = stack[top--];
                int next = stack[top];
                if (lcptable[i] <= lcptable[next]
                        && lcptable[next] != lcptable[lastIndex]) {
                    childtab[next].down = lastIndex;
                }
            }
            if (lastIndex != -1) {
                childtab[i].up = lastIndex;
                lastIndex = -1;
            }
            stack[++top] = i;
        }
        // process remaining elements
        while (0 < lcptable[stack[top]]) {
            lastIndex = stack[top--];
            int next = stack[top];
            if (0 <= lcptable[next]
                    && lcptable[next] != lcptable[lastIndex]) {
                childtab[next].down = lastIndex;
            }
        }

        // step 2: calculate nextLIndex
        top = 0;
        stack[0] = 0;
        for (int i = 1; i < len; i++) {
            while (lcptable[i] < lcptable[stack[top]]) {
                top--;
            }
            if (lcptable[i] == lcptable[stack[top]]) {
                childtab[stack[top--]].nextLIndex = i;
            }
            stack[++top] = i;
        }
    }

    /** An lcp-interval [lb..rb] with its lcp value and child intervals. */
    class LCPInterval {
        int lcp; // the lcp value of the lcp-interval
        int lb;  // the left boundary suffix index
        int rb;  // the right boundary suffix index
        List<LCPInterval> children;

        public LCPInterval(int lcp, int lb, int rb) {
            this.lcp = lcp;
            this.lb = lb;
            this.rb = rb;
        }

        @Override
        public String toString() {
            return String.format("%d-[%d..%d]", this.lcp, this.lb, this.rb);
        }
    }

    /**
     * Returns the first l-index of the interval [i..j]: childtab[j+1].up
     * when it lies inside (i..j], otherwise childtab[i].down.
     */
    private int firstLIndex(int i, int j) {
        int up = (j + 1 < len) ? childtab[j + 1].up : -1;
        return (up > i && up <= j) ? up : childtab[i].down;
    }

    /**
     * Walks the nextLIndex chain starting at {@code first} and collects
     * every child interval of [lb..rb] that spans more than one suffix.
     */
    private List<LCPInterval> collectChildren(int lb, int rb, int first) {
        List<LCPInterval> children = new ArrayList<LCPInterval>();
        int cur = first;
        if (lb < cur - 1) {
            children.add(new LCPInterval(getlcp(lb, cur - 1), lb, cur - 1));
        }
        while (childtab[cur].nextLIndex != -1) {
            int next = childtab[cur].nextLIndex;
            if (cur < next - 1) {
                children.add(new LCPInterval(getlcp(cur, next - 1), cur, next - 1));
            }
            cur = next;
        }
        if (cur < rb) {
            children.add(new LCPInterval(getlcp(cur, rb), cur, rb));
        }
        return children;
    }

    /** Returns the non-singleton child intervals of the lcp-interval [i..j]. */
    private List<LCPInterval> getChildIntervals(int i, int j) {
        return collectChildren(i, j, firstLIndex(i, j));
    }

    /** Returns the lcp value of the interval [i..j]. */
    private int getlcp(int i, int j) {
        if (i == j) {
            return lcptable[i];
        }
        return lcptable[firstLIndex(i, j)];
    }

    /** Reports {@code interval}, then recurses into its child intervals. */
    private void topDownTraverse(LCPInterval interval) {
        List<LCPInterval> childIntervals = getChildIntervals(interval.lb, interval.rb);
        interval.children = childIntervals;
        reportLCPInterval(interval);
        for (LCPInterval child : childIntervals) {
            topDownTraverse(child);
        }
    }

    /**
     * Builds the child table for {@code lcptable} and prints every
     * lcp-interval, parents before children, starting from the root
     * interval 0-[0..n-1]. Prints nothing when the text is empty.
     *
     * @param lcptable LCP table with at least one entry per suffix;
     *                 entry 0 must be 0 and no entry may be negative
     * @throws IllegalArgumentException if {@code lcptable} is null, too
     *         short, or violates the constraints above
     */
    public void topDownTraverse(int[] lcptable) {
        if (lcptable == null || lcptable.length < len) {
            throw new IllegalArgumentException(
                    "lcptable must have at least " + len + " entries");
        }
        if (len == 0) {
            return;
        }
        if (lcptable[0] != 0) {
            throw new IllegalArgumentException(
                    "lcptable[0] must be 0 but was " + lcptable[0]);
        }
        for (int i = 1; i < len; i++) {
            if (lcptable[i] < 0) {
                throw new IllegalArgumentException(
                        "lcptable[" + i + "] is negative: " + lcptable[i]);
            }
        }

        this.lcptable = lcptable;
        this.buildChildtab();

        LCPInterval root = new LCPInterval(0, 0, len - 1);
        List<LCPInterval> childIntervals;
        int first = this.childtab[0].nextLIndex;
        if (first != -1) {
            childIntervals = collectChildren(0, len - 1, first);
        } else {
            // No other index has lcp 0: the root has no l-index of its
            // own. With more than one suffix, the whole range forms its
            // single child interval; with one suffix there is no child.
            childIntervals = new ArrayList<LCPInterval>();
            if (len > 1) {
                childIntervals.add(new LCPInterval(getlcp(0, len - 1), 0, len - 1));
            }
        }
        root.children = childIntervals;
        reportLCPInterval(root);
        for (LCPInterval child : childIntervals) {
            topDownTraverse(child);
        }
    }

    /** Prints one interval and, if it has any, its comma-separated children. */
    private void reportLCPInterval(LCPInterval interval) {
        if (interval.children.isEmpty()) {
            System.out.format("%s%n", interval);
            return;
        }
        StringBuilder sb = new StringBuilder();
        for (LCPInterval child : interval.children) {
            if (sb.length() > 0) {
                sb.append(",");
            }
            sb.append(child.toString());
        }
        System.out.format("%s, children={%s}%n", interval, sb.toString());
    }

    /**
     * Computes the LCP table: entry i is the length of the longest common
     * prefix of the suffixes at sa[i-1] and sa[i]; entry 0 is 0.
     *
     * @param text the indexed text
     * @param sa   suffix array of {@code text}
     * @param rank 1-based rank of each suffix, i.e. sa[rank[p] - 1] == p
     * @return the LCP table, or null for a null or empty text
     */
    public int[] computeLCPTable(String text, int[] sa, int[] rank) {
        if (text == null) {
            return null;
        }
        int n = text.length();
        if (n == 0) {
            return null;
        }

        // lcpBySuffix[p]: LCP of suffix p with its predecessor in sa
        int[] lcpBySuffix = new int[n];
        int h = 0;
        for (int p = 0; p < n; p++) {
            int pos = rank[p] - 1;
            if (pos == 0) {
                // smallest suffix: no predecessor, LCP stays 0
                h = 0;
                continue;
            }
            int q = sa[pos - 1];
            // the previous suffix matched h chars, so this one matches
            // at least h-1; skip those and extend by comparison
            if (h > 0) {
                h--;
            }
            while (p + h < n && q + h < n
                    && text.charAt(p + h) == text.charAt(q + h)) {
                h++;
            }
            lcpBySuffix[p] = h;
        }

        // reorder from text position to suffix-array position
        int[] lcp = new int[n];
        for (int i = 0; i < n; i++) {
            lcp[i] = lcpBySuffix[sa[i]];
        }
        return lcp;
    }

    /**
     * Builds the suffix array and LCP table of the text given to the
     * constructor and prints all lcp-intervals top-down. Does nothing
     * for an empty text.
     */
    public void solve() {
        Suffix suffix = this.DC3(this.text);
        if (suffix == null) {
            return;
        }
        int[] lcp = this.computeLCPTable(this.text, suffix.sa, suffix.rank);
        this.topDownTraverse(lcp);
    }

    public static void main(String[] args) {
        String[] samples = {
            "mississippi#",
            "GACCCACCACC#",
            "abcdefghijklmmnopqrstuvwxyz#",
            "yabbadabbado#",
            "AAAAAAAAAAAAAAAAAAAAAAAAAA#",
            "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA#"
        };
        for (int i = 0; i < samples.length; i++) {
            if (i > 0) {
                System.out.format("%n********************************%n");
            }
            System.out.format("Internal Nodes for text: %s %n%n", samples[i]);
            new ESA_TopDownTraversal(samples[i]).solve();
        }
    }
}
测试:
Internal Nodes for text: mississippi#
0-[0..11], children={1-[1..4],1-[6..7],1-[8..11]}
1-[1..4], children={4-[3..4]}
4-[3..4]
1-[6..7]
1-[8..11], children={2-[8..9],3-[10..11]}
2-[8..9]
3-[10..11]
********************************
Internal Nodes for text: GACCCACCACC#
0-[0..11], children={3-[1..3],1-[4..10]}
3-[1..3]
1-[4..10], children={4-[5..6],2-[7..10]}
4-[5..6]
2-[7..10], children={5-[8..9]}
5-[8..9]
********************************
Internal Nodes for text: abcdefghijklmmnopqrstuvwxyz#
0-[0..27], children={1-[13..14]}
1-[13..14]
********************************
Internal Nodes for text: yabbadabbado#
0-[0..12], children={1-[1..4],1-[5..8],1-[9..10]}
1-[1..4], children={5-[1..2],2-[3..4]}
5-[1..2]
2-[3..4]
1-[5..8], children={3-[5..6],4-[7..8]}
3-[5..6]
4-[7..8]
1-[9..10]
********************************
Internal Nodes for text: AAAAAAAAAAAAAAAAAAAAAAAAAA#
0-[0..26], children={1-[1..26]}
1-[1..26], children={2-[2..26]}
2-[2..26], children={3-[3..26]}
3-[3..26], children={4-[4..26]}
4-[4..26], children={5-[5..26]}
5-[5..26], children={6-[6..26]}
6-[6..26], children={7-[7..26]}
7-[7..26], children={8-[8..26]}
8-[8..26], children={9-[9..26]}
9-[9..26], children={10-[10..26]}
10-[10..26], children={11-[11..26]}
11-[11..26], children={12-[12..26]}
12-[12..26], children={13-[13..26]}
13-[13..26], children={14-[14..26]}
14-[14..26], children={15-[15..26]}
15-[15..26], children={16-[16..26]}
16-[16..26], children={17-[17..26]}
17-[17..26], children={18-[18..26]}
18-[18..26], children={19-[19..26]}
19-[19..26], children={20-[20..26]}
20-[20..26], children={21-[21..26]}
21-[21..26], children={22-[22..26]}
22-[22..26], children={23-[23..26]}
23-[23..26], children={24-[24..26]}
24-[24..26], children={25-[25..26]}
25-[25..26]
********************************
Internal Nodes for text: GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA#
0-[0..53], children={1-[1..30],1-[31..34],1-[35..53]}
1-[1..30], children={2-[2..25],2-[26..30]}
2-[2..25], children={3-[3..24]}
3-[3..24], children={4-[4..23]}
4-[4..23], children={5-[5..22]}
5-[5..22], children={6-[6..22]}
6-[6..22], children={7-[7..22]}
7-[7..22], children={8-[8..22]}
8-[8..22], children={9-[9..22]}
9-[9..22], children={10-[10..22]}
10-[10..22], children={11-[11..22]}
11-[11..22], children={12-[12..22]}
12-[12..22], children={13-[13..22]}
13-[13..22], children={14-[14..22]}
14-[14..22], children={15-[15..22]}
15-[15..22], children={16-[16..22]}
16-[16..22], children={17-[17..22]}
17-[17..22], children={18-[18..22]}
18-[18..22], children={19-[19..22]}
19-[19..22], children={20-[20..22]}
20-[20..22], children={21-[21..22]}
21-[21..22]
2-[26..30], children={3-[26..28],3-[29..30]}
3-[26..28], children={5-[27..28]}
5-[27..28]
3-[29..30]
1-[31..34], children={2-[31..32],2-[33..34]}
2-[31..32]
2-[33..34]
1-[35..53], children={2-[35..38],2-[39..42],2-[43..53]}
2-[35..38], children={3-[36..38]}
3-[36..38], children={4-[36..37]}
4-[36..37]
2-[39..42], children={3-[39..40],3-[41..42]}
3-[39..40]
3-[41..42]
2-[43..53], children={3-[44..53]}
3-[44..53], children={4-[45..53]}
4-[45..53], children={5-[46..53]}
5-[46..53], children={6-[47..53]}
6-[47..53], children={7-[48..53]}
7-[48..53], children={8-[49..53]}
8-[49..53], children={9-[50..53]}
9-[50..53], children={10-[51..53]}
10-[51..53], children={11-[52..53]}
11-[52..53]