抽出的频率最高的50个词

最新推荐文章于 2022-09-14 14:21:35 发布

嗜睡狂

最新推荐文章于 2022-09-14 14:21:35 发布

阅读量216

点赞数

分类专栏：【python】 文章标签：Python

【python】专栏收录该内容

4 篇文章 0 订阅

订阅专栏

import math
 
def compute_entropy(word_list):
    """Return the Shannon entropy of the items in *word_list*.

    Each distinct item's probability is its number of occurrences divided
    by the total number of items iterated.  The natural logarithm is used,
    so the result is expressed in nats.  An empty input yields ``0.0``.
    """
    occurrences = {}
    total = 0
    for word in word_list:
        occurrences[word] = occurrences.get(word, 0) + 1
        total += 1

    entropy = 0.0
    for count in occurrences.values():
        # float() keeps the division a true division on Python 2 as well.
        prob = float(count) / total
        entropy -= prob * math.log(prob)
    return entropy
 
def count_substr_freq(corpus_path="./video.corpus", max_len=4, min_freq=10):
    """Print frequency and neighbour entropies of frequent corpus substrings.

    The corpus is read as UTF-8 text, one entry per line.  For every
    substring ``st[i:j + 1]`` with ``j > i`` (so at least two characters)
    and at most *max_len* characters, the function records:

    * how many times the substring occurs,
    * the character immediately to its left (``'^'`` at the start of a line),
    * the character immediately to its right (``'%'`` at the end of a line).

    For each substring that occurs at least *min_freq* times, one
    tab-separated row is printed: the substring, its share of all counted
    substring occurrences, the entropy of its left neighbours and the
    entropy of its right neighbours.

    Args:
        corpus_path: Path of the UTF-8 corpus file to scan.
        max_len: Longest substring, in characters, that is counted.
            NOTE(review): the length limit of the original code was lost
            when this file was extracted; 4 is an assumption - confirm
            against the intended configuration.
        min_freq: Minimum occurrence count for a substring to be printed.

    Raises:
        IOError/OSError: If *corpus_path* cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3 and
    # decodes lines to text, replacing the Python-2-only ``line.decode``.
    import io

    str_freq = {}
    str_left_word = {}
    str_right_word = {}
    tot_cnt = 0

    # ``with`` guarantees the corpus file is closed even if decoding fails.
    with io.open(corpus_path, encoding="utf-8") as fp:
        for line in fp:
            st = line.rstrip("\n")
            l = len(st)
            for i in range(l):
                left_word = st[i - 1] if i > 0 else "^"
                # j is the inclusive index of the substring's last character;
                # cap it so the substring has at most max_len characters.
                for j in range(i + 1, min(i + max_len, l)):
                    w = st[i:j + 1]
                    right_word = st[j + 1] if j < l - 1 else "%"
                    str_freq[w] = str_freq.get(w, 0) + 1
                    str_left_word.setdefault(w, []).append(left_word)
                    str_right_word.setdefault(w, []).append(right_word)
                    tot_cnt += 1

    for k, v in str_freq.items():
        if v >= min_freq:
            left_ent = compute_entropy(str_left_word[k])
            right_ent = compute_entropy(str_right_word[k])
            # Single parenthesised argument: prints identically on Python 2 and 3.
            print("%s\t%f\t%f\t%f" % (k, v * 1.0 / tot_cnt, left_ent, right_ent))
 
# Run the substring-frequency report only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
        count_substr_freq()