/******************************************************************************/
/*
trace_single_edge :
Traces a string along a single outgoing edge of a given node. It searches only
in that edge and not in other ones. The search stops when either the whole
string was found in the given edge, a part of the string was found but the edge
ended (and the next edge must be searched too - performed by function
trace_string), or one non-matching character was found.
Input : The string to be searched, given in indices of the main string.
Output: (by value) the node where tracing has stopped.
(by reference) the edge position where the last match occurred, the string
position where the last match occurred, the number of characters found, a flag
for signaling whether the search is done, and a flag to signal whether the
search stopped at the last character of an edge.
*/
NODE trace_single_edge(
SUFFIX_TREE tree,
/* Node to start from */
NODE node,
/* String to trace */
PATH str,
/* Last matching position in edge */
MyInteger edge_pos,
/* Last matching position in tree source string */
MyInteger chars_found,
/* Skip or no_skip*/
SKIP_TYPE type,
/* 1 if search is done, 0 if not */
MyInteger search_done)
{
NODE cont_node;
int length,str_len;
/* Set default return values */
search_done.intValue = 1;
edge_pos.intValue = 0;
/* Search for the first character of the string in the outcoming edge of
node */
cont_node = find_son(tree, node, tree.tree_string.charAt(str.begin));
if(cont_node == null)
{
/* Search is done, string not found */
edge_pos.intValue = get_node_label_length(tree,node)-1;
chars_found.intValue = 0;
return node;
}
/* Found first character - prepare for continuing the search */
node = cont_node;
length = get_node_label_length(tree,node);
str_len = str.end - str.begin + 1;
/* Compare edge length and string length. */
/* If edge is shorter then the string being searched and skipping is
enabled - skip edge */
if(type == SKIP_TYPE.skip)
{
if(length <= str_len)
{
(chars_found.intValue) = length;
(edge_pos.intValue) = length-1;
if(length < str_len)
search_done.intValue = 0;
}
else
{
(chars_found.intValue) = str_len;
(edge_pos.intValue) = str_len-1;
}
counter++;
return node;
}
else
{
/* Find minimum out of edge length and string length, and scan it */
if(str_len < length)
length = str_len;
for(edge_pos.intValue=1, chars_found.intValue=1; edge_pos.intValue
{
counter++;
/* Compare current characters of the string and the edge. If equal -
continue */
if(tree.tree_string.charAt(node.edge_label_start+edge_pos.intValue) != tree.tree_string.charAt(str.begin+edge_pos.intValue))
{
(edge_pos.intValue)--;
return node;
}
}
}
/* The loop has advanced edge_pos.intValue one too much */
(edge_pos.intValue)--;
if((chars_found.intValue) < str_len)
/* Search is not done yet */
search_done.intValue = 0;
return node;
}
/******************************************************************************/
/*
trace_string :
Traces a string in the tree. This function is used in the construction
process only, and not for after-construction search of substrings. It is
tailored to enable skipping (when we know a suffix is in the tree (when
following a suffix link) we can avoid comparing all symbols of the edge by
skipping its length immediately and thus save atomic operations - see
Ukkonen's algorithm, skip trick).
This function, in contrast to the function trace_single_edge, 'sees' the
whole picture, meaning it searches a string in the whole tree and not just in
a specific edge.
Input : The string, given in indices of the main string.
Output: (by value) the node where tracing has stopped.
(by reference) the edge position where the last match occurred, the string
position where the last match occurred, the number of characters found, a flag
for signaling whether the search is done.
*/
NODE trace_string(
    SUFFIX_TREE tree,
    /* Node the walk starts from */
    NODE node,
    /* String to trace; its begin index is advanced as characters match */
    PATH str,
    /* Out: last matching position in the edge where the walk stopped */
    MyInteger edge_pos,
    /* Out: total number of characters matched over all edges */
    MyInteger chars_found,
    /* skip or not */
    SKIP_TYPE type)
{
    /* Per-edge match count reported by trace_single_edge */
    MyInteger matched_on_edge = new MyInteger(0);
    /* Set to 1 by trace_single_edge once no further edge has to be visited */
    MyInteger finished = new MyInteger(0);

    chars_found.intValue = 0;

    /* Walk down one edge per iteration until trace_single_edge reports the
       search as done */
    while (finished.intValue == 0)
    {
        matched_on_edge.intValue = 0;
        edge_pos.intValue = 0;

        /* Hand trace_single_edge its own copy of the remaining range */
        PATH remaining = new PATH();
        remaining.end = str.end;
        remaining.begin = str.begin;

        node = trace_single_edge(tree, node, remaining, edge_pos,
                                 matched_on_edge, type, finished);

        /* Account for what this edge consumed */
        chars_found.intValue += matched_on_edge.intValue;
        str.begin += matched_on_edge.intValue;
    }
    return node;
}
/******************************************************************************/
/*
follow_suffix_link :
Follows the suffix link of the source node according to Ukkonen's rules.
Input : The tree, and pos. pos is a combination of the source node and the
position in its incoming edge where the suffix ends.
Output: None by value; pos is updated in place to the destination node (and
edge position) that represents the longest proper suffix of node's
path. Example: if node represents the path "abcde" then pos ends up at
the node that represents "bcde".
*/
void follow_suffix_link(SUFFIX_TREE tree, POS pos)
{
    /* gama: the part of the node's incoming edge label up to pos.edge_pos,
       needed when the node itself cannot be left through a suffix link */
    PATH gama = new PATH();
    /* trace_string requires this output argument; its value is not used */
    MyInteger ignored_count = new MyInteger(0);

    /* Nothing to follow from the root */
    if (pos.node == tree.root)
    {
        return;
    }

    /* Easy case: the node has a suffix link and pos sits on the last
       character of its edge - just follow the link */
    if (pos.node.suffix_link != null
        && is_last_char_in_edge(tree, pos.node, pos.edge_pos.intValue) != 0)
    {
        pos.node = pos.node.suffix_link;
        pos.edge_pos.intValue = get_node_label_length(tree, pos.node) - 1;
        return;
    }

    /* Otherwise (no link yet, or pos is in the middle of the edge) go through
       the father. If the father is the root its link leads nowhere useful, so
       report the root and let the caller trace from there */
    if (pos.node.father == tree.root)
    {
        pos.node = tree.root;
        return;
    }

    /* Remember gama - the indices of the node's incoming edge up to pos */
    gama.begin = pos.node.edge_label_start;
    gama.end = pos.node.edge_label_start + pos.edge_pos.intValue;

    /* Follow the father's suffix link */
    pos.node = pos.node.father.suffix_link;

    /* Walk gama back down from the link target. trace_string advances the
       begin index of the path it is given, so it gets a copy. Skipping is
       used because, per the original design note, gama must already exist
       in the tree */
    PATH walk = new PATH();
    walk.begin = gama.begin;
    walk.end = gama.end;
    pos.node = trace_string(tree, pos.node, walk, pos.edge_pos,
                            ignored_count, SKIP_TYPE.skip);
}
/******************************************************************************/
/*
create_suffix_link :
Creates a suffix link between node and the node 'link' which represents its
largest suffix. The function could be avoided but is needed to monitor the
creation of suffix links when debugging or changing the tree.
Input : The node to link from, the node to link to.
Output: None.
*/
void create_suffix_link(NODE node, NODE link)
{
    /* Point node's suffix link at 'link'; nothing else is touched */
    node.suffix_link = link;
}