qsufsort 是开源差分工具 bsdiff 使用的后缀数组(suffix array)构造算法。
qsufsort 的实现原理来自 N. Jesper Larsson 与 Kunihiko Sadakane 的论文《Faster Suffix Sorting》。
package main
import "fmt"
// split refines one group of suffixes during a suffix-sorting pass.
//
// The group occupies I[start : start+len_]. Its entries are ordered by the key
// V[I[i]+h]. Entries that share a key are kept together as a sub-group, and
// for every entry of a sub-group V[entry] is set to the index in I of that
// sub-group's last slot. A sub-group consisting of a single entry is marked
// by storing -1 in its slot of I.
//
// Ranges shorter than 16 entries are processed by repeatedly selecting the
// entries with the smallest remaining key. Longer ranges are three-way
// partitioned around a pivot key, and the "smaller" and "larger" parts are
// refined recursively.
func split(I []int, V []int, start, len_, h int) {
	end := start + len_

	// The cutoff applies to the length of the range being refined, not to the
	// length of the whole array I.
	if len_ < 16 {
		for k, j := start, 0; k < end; k += j {
			// Collect every entry carrying the smallest remaining key into
			// I[k : k+j].
			j = 1
			x := V[I[k]+h]
			for i := 1; k+i < end; i++ {
				key := V[I[k+i]+h]
				if key < x {
					// A smaller key was found: restart the collected run.
					x = key
					j = 0
				}
				if key == x {
					I[k+j], I[k+i] = I[k+i], I[k+j]
					j++
				}
			}
			// Every entry of the run gets the index of the run's last slot.
			for i := 0; i < j; i++ {
				V[I[k+i]] = k + j - 1
			}
			if j == 1 {
				I[k] = -1
			}
		}
		return
	}

	// Pivot key taken from the middle of the range.
	x := V[I[start+len_/2]+h]

	// Count the keys below and equal to the pivot so that the three target
	// regions are known up front: [start, jj) smaller, [jj, kk) equal,
	// [kk, end) larger.
	jj, kk := 0, 0
	for i := start; i < end; i++ {
		switch key := V[I[i]+h]; {
		case key < x:
			jj++
		case key == x:
			kk++
		}
	}
	jj += start
	kk += jj

	// Fill the "smaller" region; misplaced entries are swapped into the next
	// free slot of the region they belong to (j and k count the slots already
	// used in the "equal" and "larger" regions).
	j, k := 0, 0
	for i := start; i < jj; {
		switch key := V[I[i]+h]; {
		case key < x:
			i++
		case key == x:
			I[i], I[jj+j] = I[jj+j], I[i]
			j++
		default:
			I[i], I[kk+k] = I[kk+k], I[i]
			k++
		}
	}

	// Fill the rest of the "equal" region; whatever is not equal here must be
	// larger than the pivot.
	for jj+j < kk {
		if V[I[jj+j]+h] == x {
			j++
		} else {
			I[jj+j], I[kk+k] = I[kk+k], I[jj+j]
			k++
		}
	}

	if jj > start {
		split(I, V, start, jj-start, h)
	}

	// The "equal" region forms one sub-group ending at index kk-1.
	for i := 0; i < kk-jj; i++ {
		V[I[jj+i]] = kk - 1
	}
	if jj == kk-1 {
		I[jj] = -1
	}

	if end > kk {
		split(I, V, kk, end-kk, h)
	}
}
// Qsufsort computes the suffix array of old.
//
// The returned slice has len(old)+1 entries. Entry r holds the start offset of
// the suffix with rank r; the empty suffix (offset len(old)) is included and
// takes rank 0.
func Qsufsort(old []byte) []int {
	n := len(old)
	size := n + 1

	// Count how often each byte value occurs.
	var buckets [256]int
	for _, c := range old {
		buckets[c]++
	}
	// Turn the counts into exclusive prefix sums: buckets[c] becomes the
	// number of bytes in old that are smaller than c.
	total := 0
	for c := 0; c < len(buckets); c++ {
		count := buckets[c]
		buckets[c] = total
		total += count
	}

	// Distribute the offsets by first byte. Slot 0 is reserved for the empty
	// suffix, so each bucket is filled starting one slot further right; after
	// this loop buckets[c] is the index of the last slot of bucket c.
	I := make([]int, size)
	for pos, c := range old {
		buckets[c]++
		I[buckets[c]] = pos
	}

	// V[pos] is the group number of the suffix at pos: the index of the last
	// slot of the group it currently belongs to.
	V := make([]int, size)
	for pos, c := range old {
		V[pos] = buckets[c]
	}
	V[n] = 0

	// A bucket holding exactly one suffix is already sorted; mark it with -1.
	for c := 1; c < len(buckets); c++ {
		if buckets[c]-buckets[c-1] == 1 {
			I[buckets[c]] = -1
		}
	}
	// The empty suffix is a sorted group of its own.
	I[0] = -1

	// Double the compared prefix length h until a single sorted run covers
	// the whole array. A negative entry -k at the head of a run means that
	// the next k slots are already sorted.
	for h := 1; I[0] != -size; h *= 2 {
		sortedRun := 0
		pos := 0
		for pos < size {
			if entry := I[pos]; entry < 0 {
				// Skip the sorted run and remember its length so that
				// adjacent runs can be merged into one.
				sortedRun -= entry
				pos -= entry
				continue
			}
			if sortedRun != 0 {
				I[pos-sortedRun] = -sortedRun
			}
			groupLen := V[I[pos]] + 1 - pos
			split(I, V, pos, groupLen, h)
			pos += groupLen
			sortedRun = 0
		}
		if sortedRun != 0 {
			I[pos-sortedRun] = -sortedRun
		}
	}

	// V now maps every offset to its final rank; invert it to obtain the
	// suffix array.
	for pos, rank := range V {
		I[rank] = pos
	}
	return I
}
// main prints every suffix of a sample string in suffix-array order, one per
// line, prefixed with its rank.
func main() {
	const text = "hello word"
	for rank, offset := range Qsufsort([]byte(text)) {
		fmt.Println(rank, text[offset:])
	}
}
编译并运行后的输出结果:
0
1  word
2 d
3 ello word
4 hello word
5 llo word
6 lo word
7 o word
8 ord
9 rd
10 word