词频统计

最新推荐文章于 2024-06-30 17:10:23 发布

twinkleJun

最新推荐文章于 2024-06-30 17:10:23 发布

阅读量113

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/qq_37299352/article/details/102954373

版权

python 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

#中英文词频统计

import jieba
def hamlet(t):
    """Print the ten most frequent words of an English text file.

    The file at path ``t`` is read, lower-cased, stripped of punctuation and
    split on whitespace. Each of the (at most) ten most common words is
    printed left-aligned in a 10-character column, followed by its count
    right-aligned in a 5-character column.

    Args:
        t: Path of the text file to analyse; opened with the platform's
            default text encoding.
    """
    # Backslash and the double quote are escaped explicitly so that both are
    # really part of the set of characters that get blanked out.
    punctuation = "~!@#$%^&*()_+-=<>?/\\,.:;[]|'\""
    # Close the file deterministically instead of leaking the handle.
    with open(t, "r") as f:
        txt = f.read().lower()
    for ch in punctuation:
        txt = txt.replace(ch, " ")
    counts = {}
    for word in txt.split():
        counts[word] = counts.get(word, 0) + 1
    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing copes with texts that contain fewer than ten distinct words.
    for word, count in ranked[:10]:
        print("{0:<10}{1:>5}".format(word, count))
def threekingdoms(t):
    """Print the ten most frequent multi-character words of a Chinese text.

    The UTF-8 file at path ``t`` is segmented with ``jieba.lcut``. Tokens of
    length one are skipped, several names/titles are folded into a single
    canonical name, and a fixed set of excluded words is dropped before the
    (at most) ten most common entries are printed.

    Args:
        t: Path of the UTF-8 encoded text file to analyse.
    """
    excludes = {"将军","却说","荆州","二人","不可","不能","如此",
            "商议","如何","主公","军士","左右","引兵","次日",
            "军马","大喜","天下","东吴","于是","今日","不敢",
            "魏兵","陛下","一人","都督","人马","不知","汉中",
            "只见","众将","后主"}
    # Alternative names / titles mapped to the canonical name they count as.
    aliases = {
        "玄德": "刘备",
        "玄德曰": "刘备",
        "孔明": "诸葛亮",
        "孔明曰": "诸葛亮",
        "关公": "关羽",
        "云长": "关羽",
        "孟德": "曹操",
        "丞相曰": "曹操",
        "丞相": "曹操",
    }
    # Close the file deterministically instead of leaking the handle.
    with open(t, encoding="utf-8") as f:
        txt = f.read()
    counts = {}
    for word in jieba.lcut(txt):
        if len(word) == 1:
            continue
        name = aliases.get(word, word)
        counts[name] = counts.get(name, 0) + 1
    # pop() with a default tolerates excluded words that never occurred;
    # a plain ``del`` would raise KeyError for them.
    for word in excludes:
        counts.pop(word, None)
    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # Slicing copes with texts that yield fewer than ten distinct words.
    for word, count in ranked[:10]:
        print("{0:<10}{1:>5}".format(word, count))
def main():
    """Prompt for a file name and run the matching word-frequency report.

    Only the exact names ``hamlet`` and ``threekingdoms`` are recognised;
    the entered name is passed on unchanged as the path to open. Any other
    input prints an error message.
    """
    choice = input("enter a filename:")
    if choice == "hamlet":
        hamlet(choice)
        return
    if choice == "threekingdoms":
        threekingdoms(choice)
        return
    print("can't find the file!")
# Run the interactive prompt only when executed as a script, so importing
# this module does not block on input().
if __name__ == "__main__":
    main()

twinkleJun

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
词频统计

#中英文词频统计import jiebadef hamlet(t): txt = open(t,"r").read() txt = txt.lower() for ch in "~!@#$%^&*()_+-=<>?/\,.:;[]|'""": txt = txt.replace(ch," ") words = txt.spli...
复制链接

扫一扫