代码如下:
#
# A Suffix-tree implementation
#
# Next node id to hand out: Edge.__init__ uses the current value as the new
# edge's end node and then increments it. It starts at 1; the driver code
# below uses 0 as the origin node of the initial active point.
NODE_COUNT = 1
class EdgeSet:
    """Store edges, each identified by its start node and first character.

    Edges live in the ``dict`` attribute under the string key
    ``str(start_node) + first_char``.
    """

    def __init__(self):
        self.dict = {}

    @staticmethod
    def _key(start_node, first_char):
        """Build the lookup key for the edge leaving ``start_node`` with ``first_char``."""
        return str(start_node) + first_char

    def add(self, start_node, first_char, edge):
        """Store ``edge``, replacing any edge already stored under the same key."""
        self.dict[self._key(start_node, first_char)] = edge

    def remove(self, start_node, first_char):
        """Delete the stored edge; raises ``KeyError`` when no such edge exists."""
        del self.dict[self._key(start_node, first_char)]

    def seek(self, start_node, first_char):
        """Return the stored edge, or ``None`` when no such edge exists."""
        # dict.get replaces a former bare ``except:`` that would also have
        # swallowed unrelated errors (e.g. KeyboardInterrupt).
        return self.dict.get(self._key(start_node, first_char))
class Suffix:
    """A position in the tree: an origin node plus a character range.

    The position is identified by ``origin_node`` (the start node),
    ``first_char_index`` and ``last_char_index``. It is one of two kinds:

    1. explicit: ``first_char_index > last_char_index`` (empty range).
    2. implicit: ``last_char_index >= first_char_index``; the range still
       covers at least one character below ``origin_node``.

    Example for the input string ``abadabc``, starting from
    ``first_char_index = 0, last_char_index = -1``; after each character the
    pair (first_char_index, last_char_index) becomes::

        a  1,0
        b  2,1
        a  2,2
        ...

    Be careful with first_char_index and last_char_index.
    """

    def __init__(self, origin_node, first_char_index, last_char_index):
        self.origin_node = origin_node
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index

    def implicit(self):
        """Return True when the character range is non-empty."""
        return self.last_char_index >= self.first_char_index

    def explicit(self):
        """Return True when the character range is empty."""
        return self.first_char_index > self.last_char_index

    def canonize(self, edgeSet, string):
        """Move ``origin_node`` down the tree as far as the range allows.

        While the edge leaving ``origin_node`` is no longer than the remaining
        character range, consume that edge: advance ``first_char_index`` past
        it and make the edge's end node the new origin.

        ``edgeSet`` must provide ``seek(node, char)`` returning an edge with
        ``first_char_index``, ``last_char_index`` and ``end_node`` attributes;
        ``string`` is the indexed text.
        """
        if self.explicit():
            return
        edge = edgeSet.seek(self.origin_node, string[self.first_char_index])
        edge_span = edge.last_char_index - edge.first_char_index
        while edge_span <= self.last_char_index - self.first_char_index:
            self.first_char_index += edge_span + 1
            # Move the origin down to the end of the consumed edge.
            self.origin_node = edge.end_node
            if self.first_char_index <= self.last_char_index:
                edge = edgeSet.seek(edge.end_node, string[self.first_char_index])
                # Span is last - first, as above; the operands used to be
                # swapped here, which produced a non-positive span.
                edge_span = edge.last_char_index - edge.first_char_index
class Node:
    """Per-node record holding the node's suffix link.

    ``suffix_node`` is the id of the node the suffix link points to; it
    defaults to -1 until a link is assigned.
    """

    def __init__(self, suffix_node=-1):
        self.suffix_node = suffix_node

    def __repr__(self):
        return "Node(suffix_node=%r)" % (self.suffix_node,)
class Edge:
    """An edge labelled ``string[first_char_index:last_char_index + 1]``.

    The edge runs from ``start_node`` to ``end_node``. ``end_node`` is taken
    from the module-level ``NODE_COUNT`` counter, which is incremented for
    every edge created. The edge registers itself in ``edgeSet`` only when
    ``insert()`` is called.
    """

    def __init__(self, start_node, first_char_index, last_char_index, string, edgeSet):
        global NODE_COUNT
        self.start_node = start_node
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index
        self.string = string
        self.edgeSet = edgeSet
        # Every new edge gets a fresh end node id.
        self.end_node = NODE_COUNT
        NODE_COUNT += 1

    def insert(self):
        """Register this edge in its edge set under (start_node, first character)."""
        self.edgeSet.add(self.start_node, self.string[self.first_char_index], self)

    def remove(self):
        """Unregister this edge from its edge set."""
        self.edgeSet.remove(self.start_node, self.string[self.first_char_index])

    def __str__(self):
        label = self.string[self.first_char_index:self.last_char_index + 1]
        return '' + str(self.start_node) + '->' + str(self.end_node) + ':' + label

    def __eq__(self, val):
        # Comparing against a non-Edge (e.g. ``edge == None``) must not raise.
        if not isinstance(val, Edge):
            return NotImplemented
        return (self.start_node == val.start_node
                and self.first_char_index == val.first_char_index)

    def __ne__(self, val):
        # Python 2 does not derive ``!=`` from ``__eq__``; keep both consistent.
        result = self.__eq__(val)
        if result is NotImplemented:
            return result
        return not result

    def splitEdge(self, suffix, nodes, string, edgeSet):
        """Split this edge at the point described by ``suffix``.

        A new edge covering the first ``suffix.last_char_index -
        suffix.first_char_index + 1`` characters is inserted from
        ``suffix.origin_node``; this edge is shortened to the remainder and
        re-registered as leaving the new edge's end node. The new internal
        node's suffix link is initialised to ``suffix.origin_node``.

        Returns the id of the new internal node.
        """
        # Unregister before mutating: the lookup key depends on start_node
        # and first_char_index.
        self.remove()
        prefix_last = (self.first_char_index
                       + suffix.last_char_index - suffix.first_char_index)
        new_edge = Edge(suffix.origin_node, self.first_char_index, prefix_last,
                        string, edgeSet)
        new_edge.insert()
        self.start_node = new_edge.end_node
        self.first_char_index = new_edge.last_char_index + 1
        self.insert()
        nodes[new_edge.end_node].suffix_node = suffix.origin_node
        return new_edge.end_node
def suffix_tree(active, char_index, edgeSet, string, str_len, nodes):
    """Extend the tree with the character ``string[char_index]``.

    Starting at the active point, repeatedly add a leaf edge for the new
    character and follow suffix links, until a position is reached where the
    character is already present. ``active`` is updated in place.

    Parameters:
        active: the active point (a ``Suffix``); mutated.
        char_index: index in ``string`` of the character being added.
        edgeSet: edge store providing ``seek``.
        string: the text being indexed.
        str_len: value used as ``last_char_index`` of every new leaf edge
            (the driver passes ``len(string) - 1``).
        nodes: list of ``Node`` records indexed by node id; suffix links are
            written here.
    """
    last_parent_node = -1
    while True:
        parent_node = active.origin_node
        if active.explicit():
            edge = edgeSet.seek(parent_node, string[char_index])
            # An edge starting with this character already exists: done.
            # ``is not None`` avoids going through Edge.__eq__ with None.
            if edge is not None:
                break
        else:
            span = active.last_char_index - active.first_char_index
            edge = edgeSet.seek(parent_node, string[active.first_char_index])
            # Debug trace; single-argument form prints the same text on
            # Python 2 and Python 3.
            print("%s %s %s %s" % (span, edge.first_char_index, char_index,
                                   active.first_char_index))
            # The character following the active range on the edge already
            # matches: nothing to add.
            if string[edge.first_char_index + span + 1] == string[char_index]:
                break
            parent_node = edge.splitEdge(active, nodes, string, edgeSet)
        # Add a leaf edge for the new character below parent_node.
        edge = Edge(parent_node, char_index, str_len, string, edgeSet)
        edge.insert()
        # Link the parent used in the previous round to this one.
        if last_parent_node > 0:
            nodes[last_parent_node].suffix_node = parent_node
        last_parent_node = parent_node
        # Move to the next shorter suffix.
        if active.origin_node == 0:
            active.first_char_index += 1
        else:
            active.origin_node = nodes[active.origin_node].suffix_node
        active.canonize(edgeSet, string)
    if last_parent_node > 0:
        nodes[last_parent_node].suffix_node = parent_node
    active.last_char_index += 1
    active.canonize(edgeSet, string)
if __name__ == "__main__":
    # Demo: build the suffix tree of a sample string one character at a time,
    # tracing the active point after every step, then dump all edges.
    string = "abababc"
    str_len = len(string)
    edges = EdgeSet()
    active = Suffix(0, 0, -1)
    # Two Node records are allocated per input character.
    nodes = [Node() for _ in range(2 * str_len)]
    for i, char in enumerate(string):
        # New leaf edges end at the last index of the string.
        suffix_tree(active, i, edges, string, str_len - 1, nodes)
        # Single-argument print: same output on Python 2 and Python 3.
        print("%s %s %s %s" % ('---------:', char, active.first_char_index,
                               active.last_char_index))
    for edge in edges.dict.values():
        print(edge)
关键是要弄清楚 active 的 first_char_index 和 last_char_index 的含义：当 first_char_index > last_char_index 时 active 是显式（explicit）的，否则是隐式（implicit）的。
相关文章链接:
http://marknelson.us/1996/08/01/suffix-trees/