Python 中英文词频统计(字典类型 jieba库应用)

最新推荐文章于 2023-12-20 05:50:02 发布

宗介l

最新推荐文章于 2023-12-20 05:50:02 发布

阅读量1k

点赞数

本文链接：https://blog.csdn.net/qq_43296197/article/details/99989572

版权

def Get_Text():
    """Read ``hamlet.txt`` and return its text normalised for word counting.

    The text is lower-cased and every ASCII punctuation character is replaced
    by a single space, so that a later ``str.split()`` yields bare words.

    Returns:
        str: the normalised contents of ``hamlet.txt``.

    Raises:
        OSError: if ``hamlet.txt`` cannot be opened or read.
    """
    # The backslash is escaped explicitly: the bare "\]" form is an invalid
    # escape sequence that newer Python versions emit a warning for.
    punctuation = '''!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~'''
    # The context manager closes the file even if read() raises.
    with open("hamlet.txt", "r") as f:
        txt = f.read().lower()
    # One translate() pass maps every punctuation character to a space,
    # instead of one replace() pass over the whole text per character.
    return txt.translate(str.maketrans(punctuation, " " * len(punctuation)))

hamlet_txt = Get_Text()
# split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines, carriage returns, form feeds) and returns a list of words;
# split('\n') would treat only newlines as separators.
words = hamlet_txt.split()

# Tally occurrences: get(word, 0) yields the count so far (0 for a new word).
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# A dict has no sort() method, so convert the (word, count) pairs to a list.
it = list(counts.items())
Len = len(it)
# x[1] is the count (the second element of each pair); reverse=True puts the
# most frequent words first.
it.sort(key=lambda x: x[1], reverse=True)

# Print at most ten entries; bounding by Len avoids an IndexError when the
# text contains fewer than ten distinct words.
for i in range(min(10, Len)):
    word, count = it[i]
    print("{0:<10}{1:>5}".format(word, count))
print(it)

# For Chinese text, word frequencies can be computed with the jieba library.
# The snippet below is disabled: it sits inside a string literal and is never
# executed. It segments the text with jieba.lcut, skips single-character
# tokens, and then removes the words listed in `excludes`.
# NOTE(review): the snippet reuses the `counts` dict without re-initialising
# it, and `del counts[word]` raises KeyError if an excluded word never
# occurred in the text -- counts.pop(word, None) would be safer if enabled.
'''
import jieba
txt=open("hhhhhh.txt","r",encoding="utf-8").read()
words=jieba.lcut(txt)#等价于上面的words jieba精确模式 返回一个列表类型的分词结果
excludes={"具有","努克"}
for word in words:
    if len(word)==1:#可以在遍历中将单个的词 如的 了 你 排除在外
        continue
    else:
        counts[word]=counts.get(word,0)+1

for word in excludes:
    del counts[word]#前面给出字典”黑名单“ 这里通过for将黑名单中的单词删除
'''


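# Sample output (truncated). Printing counts.items() shows the pairs in first-seen order:
#dict_items([('the', 1143), ('tragedy', 3), ('o
# print(it) shows the list sorted by descending count:
#[('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i',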