Trie树实现词频统计与查找

最新推荐文章于 2024-05-09 17:52:56 发布

笛在月明

最新推荐文章于 2024-05-09 17:52:56 发布

阅读量2.3k

点赞数

分类专栏：算法 Python

本文链接：https://blog.csdn.net/IqqIqqIqqIqq/article/details/54561975

版权

Python 同时被 2 个专栏收录

45 篇文章 0 订阅

订阅专栏

算法

29 篇文章 0 订阅

订阅专栏

#encoding:utf-8
from collections import defaultdict
import sys
# Python 2-only workaround: ``sys`` is reloaded and then the process-wide
# default string encoding is set to UTF-8. The rest of this script relies on
# implicit byte-string <-> unicode conversions (e.g. when writing results).
# NOTE(review): ``reload`` as a builtin and ``sys.setdefaultencoding`` do not
# exist on Python 3 - confirm the target interpreter before porting.
reload(sys) 
sys.setdefaultencoding('utf8') 
class LBTrie:
    """Simple dict-based Trie (prefix tree).

    Every node is a plain ``dict`` mapping a single character to its child
    node. A node that terminates a stored word additionally holds the key
    ``''`` (empty string) as an end-of-word marker.

    Attributes:
        trie: Root node of the tree.
        size: Number of distinct words stored so far.
    """

    def __init__(self):
        self.trie = {}
        self.size = 0

    def add(self, word):
        """Insert ``word`` into the trie.

        Leading and trailing whitespace is stripped before insertion.

        Args:
            word: The word to store.

        Returns:
            True when the word is present in the trie after the call. This
            is also the case for repeated insertions, so callers can use the
            result to count occurrences. False when the stripped word is
            empty and nothing was stored.
        """
        word = word.strip()
        if not word:
            return False

        node = self.trie
        for char in word:
            node = node.setdefault(char, {})

        # The '' key marks "a word ends here"; only a first-time insertion
        # increases the number of distinct words.
        if '' not in node:
            node[''] = ''
            self.size += 1
        return True

    def search(self, word):
        """Return True if ``word`` was previously stored with :meth:`add`.

        The word is stripped on both sides, exactly like :meth:`add` does,
        so a query carrying a trailing newline or space still matches.
        A mere prefix of a stored word does not match.
        """
        node = self.trie
        for char in word.strip():
            if char not in node:
                return False
            node = node[char]
        # Reaching the last character is not enough: the node must carry
        # the end-of-word marker.
        return '' in node

    def output(self):
        """Print the whole trie once as nested, tab-indented brace blocks."""
        self.__print_item(self.trie)

    def __print_item(self, p, indent=0):
        """Print the subtree rooted at ``p``, starting at depth ``indent``."""
        for line in self._render_lines(p, indent):
            print(line)

    def _render_lines(self, p, indent=0):
        """Return the printable lines for the subtree rooted at ``p``.

        Each key produces an opening line ``'<key>' : {``, followed by the
        lines of its children one tab deeper, followed by a closing ``}``
        aligned under the opening brace.

        Args:
            p: A trie node (``dict``) or the ``''`` end-of-word marker value.
            indent: Number of leading tabs for this level.

        Returns:
            A list of strings, one per output line (empty for an empty node).
        """
        lines = []
        if not p:
            # Empty node or the '' marker value: nothing to render.
            return lines

        ind = '\t' * indent
        for key in p:
            label = "'%s' : " % key
            lines.append(ind + label + '{')
            lines.extend(self._render_lines(p[key], indent + 1))
            # Close every key's block so the braces stay balanced.
            lines.append(ind + ' ' * len(label) + '}')
        return lines

def codeutil(strs):
    """Decode UTF-8 bytes to text, keeping only GBK-representable characters.

    The input is decoded as UTF-8 (undecodable bytes are dropped), then
    round-tripped through GBK so that any character GBK cannot encode is
    dropped as well.

    Args:
        strs: A byte string holding UTF-8 encoded text.

    Returns:
        The decoded text restricted to characters that exist in GBK.
    """
    text = strs.decode('utf8', 'ignore')
    gbk_bytes = text.encode('GBK', 'ignore')
    return gbk_bytes.decode('GBK', 'ignore')

if __name__ == '__main__':
    # Word-frequency demo: read a space-separated corpus from content.txt,
    # store every word in a trie, write the counts (most frequent first) to
    # tree.txt, then look one sample word up.
    trie_obj = LBTrie()
    countdic = defaultdict(int)

    # Add words. The file is streamed line by line and closed automatically.
    with open('content.txt', 'r') as corpus:
        for record in corpus:
            for raw_word in record.split(' '):
                # Normalize once and use the same text for the trie and for
                # the counter, so "foo" and "foo\n" count as the same word.
                word = codeutil(raw_word).strip()
                if not word:
                    # Consecutive spaces / blank lines yield empty tokens.
                    continue
                if trie_obj.add(word):
                    countdic[word] += 1

    resortedcountdic = sorted(countdic.items(), key=lambda item: item[1], reverse=True)

    # Output format is unchanged: "<word>\t<count>\t" repeated on one line.
    # The words are text returned by codeutil(); writing them relies on the
    # interpreter's default encoding being UTF-8.
    with open('tree.txt', 'w+') as tree:
        for word, count in resortedcountdic:
            tree.write(word + '\t' + str(count) + '\t')

    # Look up a word.
    if trie_obj.search(codeutil('氨基酸')):
        print('Yes')
    else:
        print('No')