问题:
有N个字符串,计算它们公共的最长连续子串。例如,字符串集合{abcd,bcca,aabc},它们公共的最长连续子串是bc。又如{abcfd,bcabca,aabcf}的公共的最长连续子串是abc。
针对本问题,可以用特殊的字符(该字符必须是N个串中没有出现的字符)将N个字符串连起来形成一个新串,然后构建这个新串的后缀树。比如对字符串集合 {abcd,bcca,aabc},可以连成新串abcd$bcca@aabc%,其中子串之间的分隔符为 '$','@','%'。满足条件的最长子串就是最深的分叉结点,而且该分叉结点下面的后缀的suffix(注意是后缀的后缀!)包含了这三个分隔字符:即至少有一个后缀的suffix为xxx$.....,至少有一个后缀的suffix为yyy@....(这里字符串yyy中没有字符$),至少有一个后缀的suffix为zzz%...(其中字符串zzz中没有$和@字符)。以下代码中,判断分隔字符是否在子树中的函数为 containTerminators()。
利用这种方式生成的后缀树有一个特点:带终结字符的结点都是叶子结点,不可能为内部结点,这一点跟前缀树和不带分隔符的单个子串后缀树是不同的(类似于Ukkonen构造法中末尾添加一个特殊字符将隐式后缀树自动转换成真正的后缀树)。利用这个特点,只需要考虑:非叶子结点的情况,因为满足条件的分叉结点不会是叶子结点。
在下面的实现中,首先采用蛮力法insert()构造后缀树;然后调用findLCS()查找最长公共子串。只需要考虑非叶子结点(child.terminal = false)。findLCS函数返回的是currNode的子树中满足条件的LCS。
实现:
import java.util.LinkedList;
import java.util.List;
/**
*
* Find LCS(Longest Common Substring) of N strings (N>=2)
*
* Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
* Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
*
* @author ljs
* 2011-06-29
*
*/
public class LCS {
private class SuffixNode {
private String key;
private List<SuffixNode> children = new LinkedList<SuffixNode>();
//use "#" for terminal char
private boolean terminal;
public SuffixNode(){
this.key = "";
}
public SuffixNode(String key){
this.key = key;
}
public String toString(){
return this.key + (this.terminal?"#":"") + "(" + children.size() +")";
}
}
private SuffixNode root;
private String text;
//terminators should be ordered according to input strings
private char[] terminators;
public LCS(String text,char[] terminators){
this.text = text;
this.terminators = terminators;
}
private void insert(SuffixNode currNode,String key,int startIndex) throws Exception{
boolean done = false;
for(int i=0;i<currNode.children.size();i++){
SuffixNode child = currNode.children.get(i);
//use min(child.key.length, key.length)
int len = child.key.length()<key.length()?child.key.length():
key.length();
int j = 0;
for(;j<len;j++){
if(key.charAt(j) != child.key.charAt(j)){
break;
}
}
if(j==0){//this child doesn't match any character with the new key
//order keys by lexi-order
if(key.charAt(0)<child.key.charAt(0)){
//e.g. child="e" (currNode="abc")
// abc abc
// / / =========> / | /
// e f insert "c" c# e f
SuffixNode node = new SuffixNode(key);
currNode.children.add(i,node);
node.terminal = true;
done = true;
break;
}else{ //key.charAt(0)>child.key.charAt(0)
//don't forget to add the largest new key after iterating all children
continue;
}
}else{//current child's key partially matches with the new key; 0<j<=len
if(j==len){
if(key.length()==child.key.length()){
if(child.terminal){
throw new Exception("Duplicate Key is found when insertion!");
}else{
//e.g. child="ab"
// ab ab#
// / / =========> / /
// e f insert "ab" e f
child.terminal = true;
}
}else if(key.length()>child.key.length()){
//e.g. child="ab#"
// ab# ab#
// / / ==========> / | /
// e f insert "abc" c# e f
String subkey = key.substring(j);
//recursion
insert(child,subkey,startIndex+j);
}else{ //key.length()<child.key.length()
//e.g. child="abc#"
// abc# ab#
// / / =========> /
// e f insert "ab" c#
// / /
// e f
String childSubkey = child.key.substring(j); //c
SuffixNode subChildNode = new SuffixNode(childSubkey);
subChildNode.terminal = child.terminal;
subChildNode.children = child.children; //inherited from parent
child.key = key; //ab
child.terminal = true; //ab#
child.children = new LinkedList<SuffixNode>();
child.children.add(subChildNode);
}
}else{//0<j<len
//e.g. child="abc#"
// abc# ab
// / / ==========> / /
// e f insert "abd" c# d#
// / /
// e f
//split at j
String childSubkey = child.key.substring(j); //c
String subkey = key.substring(j); //d
SuffixNode subChildNode = new SuffixNode(childSubkey);
subChildNode.terminal = child.terminal;
subChildNode.children = child.children; //inherited from parent
//update child's key
child.key = child.key.substring(0,j);
//child is not terminal now due to split, it is inherited by subChildNode
child.terminal = false;
//Note: no need to merge subChildNode
SuffixNode node = new SuffixNode(subkey);
node.terminal = true;
child.children = new LinkedList<SuffixNode>();
if(subkey.charAt(0)<childSubkey.charAt(0)){
child.children.add(node);
child.children.add(subChildNode);
}else{
child.children.add(subChildNode);
child.children.add(node);
}
}
done = true;
break;
}
}
if(!done){
SuffixNode node = new SuffixNode(key);
node.terminal = true;
currNode.children.add(node);
}
}
public void insert(String suffix,int startIndex) throws Exception{
if(suffix == null || suffix.length() == 0) return;
if(root==null){
root = new SuffixNode();
}
insert(root,suffix,startIndex);
}
//build a suffix-tree for a string of text
public void buildSuffixTree() throws Exception{
for(int i=0;i<text.length();i++){
this.insert(text.substring(i), i);
}
}
//for test purpose only
public void printTree(){
this.print(0, this.root);
}
private void print(int level, SuffixNode node){
for (int i = 0; i < level; i++) {
System.out.format(" ");
}
System.out.format("|");
for (int i = 0; i < level; i++) {
System.out.format("-");
}
if (node.terminal)
System.out.format("%s#%n", node.key);
else
System.out.format("%s%n", node.key);
for (SuffixNode child : node.children) {
print(level + 1, child);
}
}
public String findLCS(){
return findLCS(root);
}
//return the longest substring starting with current node (but not including currNode.key)
private String findLCS(SuffixNode currNode){
int maxDepth = currNode.key.length();
int currDepth = currNode.key.length();
String longestSubstrSuffix = "";
for(int i=0;i<currNode.children.size();i++){
SuffixNode child = currNode.children.get(i);
if(!child.terminal){
int depth = currDepth + child.key.length();
//terminators are unique, so terminal child is excluded
boolean containsTerminators = containTerminators(child);
if(containsTerminators){
if(depth > maxDepth){
maxDepth = depth;
longestSubstrSuffix = child.key;
}
String longestChildSubstrSuffix = findLCS(child);
if(longestChildSubstrSuffix.length()>0){ //not a part of LCS if longestChildSubstrSuffix's lenght is 0
int maxChildDepth = longestChildSubstrSuffix.length() + depth;
if(maxChildDepth > maxDepth){
maxDepth = maxChildDepth;
//the substring is relative to currNode
longestSubstrSuffix = child.key + longestChildSubstrSuffix;
}
}
}
}
}
return longestSubstrSuffix;
}
private boolean containTerminators(SuffixNode currNode){
boolean[] done = new boolean[terminators.length];
return containTerminators(currNode,done);
}
private boolean containTerminators(SuffixNode currNode,boolean[] done){
boolean allDone = false;
for(int i=0;i<currNode.children.size();i++){
SuffixNode child = currNode.children.get(i);
if(child.terminal){
//Note: here the order of terminator is important
for(int j=0;j<terminators.length;j++){
int pos = child.key.indexOf(terminators[j]);
if(pos>=0){
done[j]=true;
break;
}
}
}else{
containTerminators(child,done);
}
allDone = true;
for(int j=0;j<done.length;j++){
if(!done[j]){
allDone = false;
break;
}
}
if(allDone)
break;
}
return allDone;
}
public static void main(String[] args) throws Exception {
System.out.println("****************************");
System.out.format("LCS for 3 strings:{abc,bcabca,aabcf}%n");
String text = "abc$bcabca@aabcf%";
LCS strie = new LCS(text,new char[]{'
,'@','%'});strie.buildSuffixTree();//strie.printTree();String longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);System.out.println("****************************");System.out.format("LCS for 3 strings:{abcd,bcca,aabc}%n");text = "abcd$bcca@aabc%";strie = new LCS(text,new char[]{'
,'@','%'});strie.buildSuffixTree();//strie.printTree();longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);System.out.println("****************************");System.out.format("LCS for 2 strings:{asdfldkjxlfjax123456789abckljddfdfe123456789ees, xsdldkjalfla123456789abcfleur123456789ljafa}%n");text = "asdfldkjxlfjax123456789abckljddfdfe123456789ees$xsdldkjalfla123456789abcfleur123456789ljafa@";strie = new LCS(text,new char[]{'
,'@'});strie.buildSuffixTree();//strie.printTree();longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);System.out.println("****************************");System.out.format("LCS for 4 strings:{abcd,abce,abd,bdess}%n");text = "abcd$abce@abd%bdess&";strie = new LCS(text,new char[]{'
,'@','%','&'});strie.buildSuffixTree();//strie.printTree();longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);}}
测试输出:
****************************
LCS for 3 strings:{abc,bcabca,aabcf}
abc
****************************
LCS for 3 strings:{abcd,bcca,aabc}
bc
****************************
LCS for 2 strings:{asdfldkjxlfjax123456789abckljddfdfe123456789ees, xsdldkjalfla123456789abcfleur123456789ljafa}
123456789abc
****************************
LCS for 4 strings:{abcd,abce,abd,bdess}
b