Suffix tree(后缀树)的 Python 实现

 代码如下:

#
# A Suffix-tree implementation
#

# Module-wide counter used to number nodes: each Edge created takes the
# current value as its end_node id and then increments it. It starts at 1
# because id 0 is the origin node the driver starts from.
NODE_COUNT = 1

class EdgeSet:
    '''
    Lookup table of edges.

    An edge is identified by the node it starts from together with the
    first character of its label; the two are combined into one string
    key of the form str(start_node) + first_char.
    '''
    def __init__(self):
        # key (see _key) -> edge object
        self.dict = {}

    def _key(self, start_node, first_char):
        '''Build the dictionary key for (start_node, first_char).'''
        return str(start_node) + first_char

    def add(self, start_node, first_char, edge):
        '''Store edge under (start_node, first_char), replacing any previous entry.'''
        self.dict[self._key(start_node, first_char)] = edge

    def remove(self, start_node, first_char):
        '''Delete the entry for (start_node, first_char); raises KeyError if absent.'''
        del self.dict[self._key(start_node, first_char)]

    def seek(self, start_node, first_char):
        '''Return the edge stored for (start_node, first_char), or None if there is none.'''
        # dict.get reports a missing key as None without a bare ``except:``
        # that would also hide unrelated errors.
        return self.dict.get(self._key(start_node, first_char))

class Suffix:
    '''
    A position in the tree: an origin node plus the substring
    string[first_char_index .. last_char_index] that still has to be
    followed from that node.

    The position has two states:
    1. explicit: first_char_index > last_char_index, i.e. the substring is
       empty and the position is exactly the origin node.
    2. implicit: last_char_index >= first_char_index, i.e. the position lies
       part-way along an edge leaving the origin node.

    For example, with the input string "abadabc" and the initial state
    first_char_index = 0, last_char_index = -1, the pair
    (first_char_index, last_char_index) after each character is:
        a -> 1,0
        b -> 2,1
        a -> 2,2
        ...
    '''
    def __init__(self, origin_node, first_char_index, last_char_index):
        self.origin_node = origin_node
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index

    def implicit(self):
        '''Return True when the substring is non-empty.'''
        return self.last_char_index >= self.first_char_index

    def explicit(self):
        '''Return True when the substring is empty.'''
        return self.first_char_index > self.last_char_index

    def canonize(self, edgeSet, string):
        '''
        Move origin_node down the tree as far as the substring allows.

        While the edge that starts with the first substring character is no
        longer than the substring itself, the whole edge is consumed:
        first_char_index skips past it and origin_node becomes the node at
        the end of that edge. An explicit position is left untouched.

        edgeSet -- object whose seek(node, char) returns the matching edge
        string  -- the text the indices refer to
        '''
        if self.explicit():
            return

        edge = edgeSet.seek(self.origin_node, string[self.first_char_index])
        edge_span = edge.last_char_index - edge.first_char_index

        while edge_span <= self.last_char_index - self.first_char_index:
            self.first_char_index += edge_span + 1
            # move down to the node at the end of the consumed edge
            self.origin_node = edge.end_node

            if self.first_char_index <= self.last_char_index:
                edge = edgeSet.seek(edge.end_node, string[self.first_char_index])
                # Span is last - first (number of characters minus one). The
                # reversed subtraction produced a negative span, which kept
                # the loop running past the correct node.
                edge_span = edge.last_char_index - edge.first_char_index
           

class Node:
    '''
    Per-node record; currently it only carries the suffix link.

    suffix_node is the id of the node this one links to, with -1 meaning
    that no link has been assigned yet.
    '''
    def __init__(self, suffix_node=-1):
        setattr(self, 'suffix_node', suffix_node)
       
class Edge:
    '''
    An edge of the tree, labelled with string[first_char_index .. last_char_index].

    The edge leaves start_node and arrives at end_node. end_node is
    allocated automatically from the module-level NODE_COUNT counter each
    time an Edge is constructed.
    '''
    def __init__(self, start_node, first_char_index, last_char_index, string, edgeSet):
        global NODE_COUNT

        self.start_node = start_node
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index
        self.string = string
        self.edgeSet = edgeSet

        # Every new edge gets a fresh node id for its far end.
        self.end_node = NODE_COUNT
        NODE_COUNT += 1

    def insert(self):
        '''Register this edge in its edge set under (start_node, first label char).'''
        self.edgeSet.add(self.start_node, self.string[self.first_char_index], self)

    def remove(self):
        '''Unregister this edge from its edge set.'''
        self.edgeSet.remove(self.start_node, self.string[self.first_char_index])

    def __str__(self):
        label = self.string[self.first_char_index:self.last_char_index + 1]
        return '%s->%s:%s' % (self.start_node, self.end_node, label)

    def __eq__(self, val):
        # Comparing against None or any other non-Edge must not raise; let
        # Python fall back to its default comparison instead.
        if not isinstance(val, Edge):
            return NotImplemented
        return (self.start_node == val.start_node
                and self.first_char_index == val.first_char_index)

    def __ne__(self, val):
        # Python 2 does not derive != from ==, so keep the two consistent.
        result = self.__eq__(val)
        if result is NotImplemented:
            return result
        return not result

    def splitEdge(self, suffix, nodes, string, edgeSet):
        '''
        Split this edge at the position described by suffix.

        A new edge covering the first
        (suffix.last_char_index - suffix.first_char_index + 1) characters of
        this edge's label is inserted from suffix.origin_node; this edge is
        shortened to the remainder and re-attached below the new edge's end
        node. The new node's suffix link is initialised to suffix.origin_node.

        nodes   -- indexable collection of objects with a suffix_node attribute
        string  -- text the new edge's indices refer to
        edgeSet -- edge set the new edge is registered in

        Returns the id of the newly created middle node.
        '''
        # Must unregister before first_char_index changes, because the
        # lookup key depends on it.
        self.remove()

        matched_span = suffix.last_char_index - suffix.first_char_index
        new_edge = Edge(suffix.origin_node,
                        self.first_char_index,
                        self.first_char_index + matched_span,
                        string,
                        edgeSet)
        new_edge.insert()

        self.start_node = new_edge.end_node
        self.first_char_index = new_edge.last_char_index + 1
        self.insert()

        nodes[new_edge.end_node].suffix_node = suffix.origin_node

        return new_edge.end_node

def suffix_tree(active, char_index,edgeSet,string,str_len,nodes):
    '''
    Extend the tree with the character string[char_index].

    active     -- Suffix describing the current active point; updated in place
    char_index -- index in string of the character being added
    edgeSet    -- EdgeSet holding every edge of the tree
    string     -- the full input text
    str_len    -- last_char_index given to every newly created leaf edge
                  (the driver passes len(string) - 1, despite the name)
    nodes      -- indexable collection of Node objects, indexed by node id
    '''
    last_parent_node = -1
    parent_node = -1

    while True:
        parent_node = active.origin_node

        if active.explicit():
            edge = edgeSet.seek(parent_node, string[char_index])
            # An edge starting with this character already exists: nothing
            # to add in this step. Identity test on purpose, since != would
            # go through Edge.__eq__.
            if edge is not None:
                break
        else:
            span = active.last_char_index - active.first_char_index
            edge = edgeSet.seek(parent_node, string[active.first_char_index])

            # The character following the active point on the edge already
            # matches, so the suffix is present implicitly.
            if string[edge.first_char_index + span + 1] == string[char_index]:
                break
            # Otherwise split the edge at the active point; the new middle
            # node becomes the parent of the new leaf.
            parent_node = edge.splitEdge(active, nodes, string, edgeSet)

        edge = Edge(parent_node, char_index, str_len, string, edgeSet)
        edge.insert()

        # Chain suffix links between the parents visited in this call.
        if last_parent_node > 0:
            nodes[last_parent_node].suffix_node = parent_node

        last_parent_node = parent_node

        # Step to the next shorter suffix: from node 0 drop the first
        # character, from any other node follow its suffix link.
        if active.origin_node == 0:
            active.first_char_index += 1
        else:
            active.origin_node = nodes[active.origin_node].suffix_node

        active.canonize(edgeSet, string)

    if last_parent_node > 0:
        nodes[last_parent_node].suffix_node = parent_node
    # The active point now also covers the character just processed.
    active.last_char_index += 1
    active.canonize(edgeSet, string)


if __name__ == "__main__":
    # Demo driver: build the tree for a fixed string one character at a
    # time, tracing the active point after each step, then dump every edge.
    string = "abababc"
    str_len = len(string)

    edges = EdgeSet()
    active = Suffix(0, 0, -1)

    # Pre-allocate two Node records per input character so node ids can be
    # used directly as indices into the list.
    nodes = [Node() for _ in range(2 * str_len)]

    for i in range(str_len):
        suffix_tree(active, i, edges, string, str_len - 1, nodes)
        # Single-argument print(...) gives the same text on Python 2 and 3.
        print('---------: %s %s %s' % (string[i],
                                       active.first_char_index,
                                       active.last_char_index))

    for edge in edges.dict.values():
        print(edge)
   
   
        
关键是要弄清楚 active 的 first_char_index 和 last_char_index 的含义:当 first_char_index > last_char_index 时,active 处于显式(explicit)状态;否则处于隐式(implicit)状态。

 

 

相关文章链接:

http://marknelson.us/1996/08/01/suffix-trees/

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值