suffix tree，python 版

最新推荐文章于 2024-08-06 17:45:33 发布

tianqio

最新推荐文章于 2024-08-06 17:45:33 发布

阅读量2.1k

点赞数

文章标签： tree python string class input insert

本文链接：https://blog.csdn.net/tianqio/article/details/4092914

版权

代码如下：

#
# A Suffix-tree implementation
#

# Next node id to hand out. Edge.__init__ takes the current value as the new
# edge's end_node and then increments it. It starts at 1; the driver code
# starts the active point at node 0, which no edge ever receives as end_node.
NODE_COUNT = 1

class EdgeSet:
    '''
    Registry of edges, looked up by (start node, first character of the edge label).

    Edges are kept in ``self.dict`` under the string key
    ``str(start_node) + first_char``.
    '''
    def __init__(self):
        self.dict = {}

    def _key(self, start_node, first_char):
        # Single place that defines the key format shared by add/remove/seek.
        return str(start_node) + first_char

    def add(self,start_node,first_char,edge):
        '''Store ``edge`` under (start_node, first_char), replacing any previous entry.'''
        self.dict[self._key(start_node, first_char)] = edge

    def remove(self,start_node,first_char):
        '''Delete the entry for (start_node, first_char); raises KeyError if it is absent.'''
        del self.dict[self._key(start_node, first_char)]

    def seek(self,start_node,first_char):
        '''Return the edge stored for (start_node, first_char), or None if there is none.'''
        # dict.get instead of a bare ``except:`` so only a missing key maps to
        # None and unrelated errors are not silently swallowed.
        return self.dict.get(self._key(start_node, first_char))

class Suffix:
    '''
    A position in the tree, described by origin_node (the node the position
    hangs from) plus the range first_char_index..last_char_index of the input
    string that still has to be walked below that node.

    A position has two states:
    1. explicit: first_char_index > last_char_index, the range is empty and
       the position is exactly at origin_node
    2. implicit: last_char_index >= first_char_index, the position lies part
       way along an edge leaving origin_node, so that edge may need to be split

    For example, with the input string abadabc and the initial state
    first_char_index = 0, last_char_index = -1, the sequence of
    (first_char_index, last_char_index) after each character is:
    a 1,0
    b 2,1
    a 2,2
    ....
    Be careful with the last_char_index and first_char_index
    '''
    def __init__(self,origin_node,first_char_index,last_char_index):
        self.origin_node = origin_node
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index

    def implicit(self):
        '''Return True when the character range is non-empty.'''
        return self.last_char_index >= self.first_char_index

    def explicit(self):
        '''Return True when the character range is empty.'''
        return self.first_char_index > self.last_char_index

    def canonize(self,edgeSet,string):
        '''
        Move origin_node down the tree as far as the pending character range
        allows, so that the remaining range is shorter than the edge it lies on.

        edgeSet must provide seek(node, char) returning an edge with
        first_char_index, last_char_index and end_node; string is the indexed
        input text. Does nothing when the position is already explicit.
        '''
        if self.explicit():
            return

        edge = edgeSet.seek(self.origin_node, string[self.first_char_index])
        edge_span = edge.last_char_index - edge.first_char_index

        # While the pending range covers the whole edge, consume the edge and
        # continue from the node at its far end.
        while edge_span <= self.last_char_index - self.first_char_index:
            self.first_char_index += edge_span + 1
            self.origin_node = edge.end_node

            if self.first_char_index <= self.last_char_index:
                edge = edgeSet.seek(edge.end_node, string[self.first_char_index])
                # Span is last - first, same as above; the reversed subtraction
                # produced a negative span and walked the range backwards.
                edge_span = edge.last_char_index - edge.first_char_index


class Node:
    '''
    Per-node record used by the tree builder.

    suffix_node holds the id of another node (the builder assigns it when
    edges are split); -1 is the default, used until a value has been assigned.
    '''
    def __init__(self, suffix_node=-1):
        self.suffix_node = suffix_node

class Edge:
    '''
    An edge from start_node to end_node labelled with
    string[first_char_index:last_char_index + 1].

    end_node is allocated from the module-level NODE_COUNT counter when the
    edge is created. edgeSet is the registry the edge inserts itself into and
    removes itself from.
    '''
    def __init__(self,start_node,first_char_index,last_char_index,string,edgeSet):
        global NODE_COUNT

        self.start_node = start_node
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index
        self.string = string
        self.edgeSet = edgeSet

        # Every new edge gets a brand-new end node id.
        self.end_node = NODE_COUNT
        NODE_COUNT += 1

    def insert(self):
        '''Register this edge in edgeSet under (start_node, first label character).'''
        self.edgeSet.add(self.start_node,self.string[self.first_char_index],self)

    def remove(self):
        '''Unregister this edge from edgeSet using its current start_node and first character.'''
        self.edgeSet.remove(self.start_node,self.string[self.first_char_index])

    def __str__(self):
        label = self.string[self.first_char_index:self.last_char_index + 1]
        return str(self.start_node) + '->' + str(self.end_node) + ':' + label

    def __eq__(self,val):
        '''Edges are equal when they share start_node and first_char_index.'''
        # Comparing with None or any other object lacking these attributes
        # must not raise; NotImplemented lets Python fall back to its default
        # comparison.
        try:
            return (self.start_node == val.start_node
                    and self.first_char_index == val.first_char_index)
        except AttributeError:
            return NotImplemented

    def splitEdge(self,suffix,nodes,string,edgeSet):
        '''
        Split this edge at the position described by suffix and return the id
        of the node created at the split point.

        A new edge takes over the first part of the label (as many characters
        as suffix spans) starting at suffix.origin_node; this edge is re-keyed
        to start at the new middle node and keeps the rest of the label. The
        new node's suffix_node is initialised to suffix.origin_node in nodes.
        '''
        # Must unregister before start_node/first_char_index change, since
        # both are part of the registry key.
        self.remove()

        span = suffix.last_char_index - suffix.first_char_index
        new_edge = Edge(suffix.origin_node, self.first_char_index,
                        self.first_char_index + span, string, edgeSet)
        new_edge.insert()

        self.start_node = new_edge.end_node
        self.first_char_index = new_edge.last_char_index + 1
        self.insert()

        nodes[new_edge.end_node].suffix_node = suffix.origin_node

        return new_edge.end_node

def suffix_tree(active, char_index,edgeSet,string,str_len,nodes):
    '''
    Extend the tree by one character, string[char_index], updating the active
    point in place.

    active    -- the current active point (a Suffix); modified by this call
    edgeSet   -- edge registry providing seek(node, char)
    string    -- the full input text
    str_len   -- used as last_char_index of every newly created leaf edge
                 (the driver passes len(string) - 1)
    nodes     -- list indexed by node id whose items carry suffix_node
    '''
    last_parent_node = -1
    parent_node = -1

    while True:
        parent_node = active.origin_node

        if active.explicit():
            edge = edgeSet.seek(parent_node, string[char_index])
            # An edge starting with this character already exists: nothing to add.
            # Identity test on purpose: ``!= None`` would go through Edge.__eq__.
            if edge is not None:
                break
        else:
            span = active.last_char_index - active.first_char_index
            edge = edgeSet.seek(parent_node, string[active.first_char_index])
            # Debug trace; single-argument print() emits the same text on
            # Python 2 and Python 3.
            print('%s %s %s %s' % (span, edge.first_char_index, char_index,
                                   active.first_char_index))

            # The character following the active point on this edge already
            # matches the new character: nothing to add.
            if string[edge.first_char_index + span + 1] == string[char_index]:
                break
            parent_node = edge.splitEdge(active, nodes, string, edgeSet)

        # Hang a new leaf edge for the new character off parent_node.
        edge = Edge(parent_node, char_index, str_len, string, edgeSet)
        edge.insert()

        # Link the parent handled in the previous iteration to this one.
        if last_parent_node > 0:
            nodes[last_parent_node].suffix_node = parent_node
        last_parent_node = parent_node

        # Move on to the next shorter suffix.
        if active.origin_node == 0:
            active.first_char_index += 1
        else:
            active.origin_node = nodes[active.origin_node].suffix_node

        active.canonize(edgeSet, string)

    if last_parent_node > 0:
        nodes[last_parent_node].suffix_node = parent_node
    active.last_char_index += 1
    active.canonize(edgeSet, string)

if __name__ == "__main__":
    # Demo driver: build the tree for a fixed string one character at a time,
    # printing the active point after each step and every edge at the end.
    string = "abababc"
    str_len = len(string)

    edges = EdgeSet()
    active = Suffix(0,0,-1)

    # Pre-allocate two Node records per input character so that every node id
    # handed out while building can be indexed in ``nodes``.
    nodes = [Node() for _ in range(2 * str_len)]

    for i in range(str_len):
        # The last argument is the index of the final character, not the length.
        suffix_tree(active, i, edges, string, str_len - 1, nodes)
        # Single-argument print() emits the same text on Python 2 and Python 3.
        print('---------: %s %s %s' % (string[i], active.first_char_index,
                                       active.last_char_index))

    for edge in edges.dict.values():
        print(edge)



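关键是要弄清楚 active 的 first_char_index 和 last_char_index 的含义：当 first_char_index > last_char_index 时，active 处于显式（explicit）状态；当 last_char_index >= first_char_index 时，active 处于隐式（implicit）状态。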