#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on Dec 17, 2012
@author: honghe
'''
import pickle
class TrieNode(object):
    '''A single node of the trie.

    Attributes are documented inline below.
    '''

    def __init__(self):
        # Number of times the string represented by this node has occurred.
        # Starts at 1: a fresh node already stands for one occurrence.
        self.count = 1
        # Maps the next element of a sequence to the child TrieNode.
        self.children = dict()
class Trie(object):
    '''Prefix tree that counts how often each prefix has been inserted.

    Every node reachable from the root corresponds to a prefix of at least
    one added sequence; its ``count`` is the number of ``add`` calls whose
    sequence started with that prefix.
    '''

    def __init__(self):
        # The root stands for the empty prefix; ``add`` never changes its
        # count, so it stays at the TrieNode default.
        self.root = TrieNode()

    def add(self, sequence):
        '''Insert ``sequence`` into the trie, one element at a time.

        Elements are used as dictionary keys, so they must be hashable.
        Nodes on the path that already exist get their count incremented;
        missing nodes are created (and start with the TrieNode default
        count).
        '''
        current = self.root
        for symbol in sequence:
            known = current.children.get(symbol)
            if known is not None:
                # Prefix seen before: record one more occurrence.
                known.count += 1
                current = known
                continue
            # First time this prefix appears: extend the path.
            fresh = TrieNode()
            current.children[symbol] = fresh
            current = fresh

    def countSeq(self, sequence):
        '''Return how many times ``sequence`` occurred as a prefix.

        Returns 0 as soon as an element has no matching child. For an
        empty ``sequence`` the root's own count is returned.
        '''
        current = self.root
        for symbol in sequence:
            try:
                current = current.children[symbol]
            except KeyError:
                # No path for this prefix: it was never added.
                return 0
        return current.count
def gen_trie(input_file, output_file):
    '''Build a suffix trie from a text file and pickle it to disk.

    Each line of ``input_file`` is stripped of surrounding whitespace,
    terminated with '$', and every suffix of the result is added to the
    trie. The finished trie is pickled to ``output_file`` and also
    returned.
    '''
    suffix_trie = Trie()
    with open(input_file) as source:
        for raw in source:
            # The trailing '$' marks the end of the line, so a complete
            # suffix can be told apart from a mere substring.
            terminated = raw.strip() + '$'
            for start, _ in enumerate(terminated):
                suffix_trie.add(terminated[start:])
    # NOTE(review): the trie is as deep as the longest terminated line, and
    # pickle can hit the interpreter recursion limit on deeply nested
    # objects -- confirm that input lines are short enough.
    with open(output_file, 'wb') as sink:
        pickle.dump(suffix_trie, sink)
    return suffix_trie
if __name__ == '__main__':
    # Script entry point: read 'data.txt' from the current directory and
    # write the pickled suffix trie to 'data.pkl'.
    txt, pkl = 'data.txt', 'data.pkl'
    t = gen_trie(txt, pkl)