词频统计(Python)

def read_file(file):
    """Read a UTF-8 text file and return a normalised, ASCII-only string.

    Processing steps, in order:
      1. every non-ASCII character (e.g. Chinese text, full-width
         punctuation) is dropped;
      2. the remaining text is lower-cased;
      3. every character listed in ``string_sign`` is replaced by a space.

    Newlines, digits and other ASCII whitespace are left untouched.

    Args:
        file: Path of the text file to read.

    Returns:
        The cleaned text as one string.
    """
    # Symbols that get turned into a space.
    # NOTE(review): the curly quote in this set is non-ASCII, so it has
    # already been stripped by the time the replacement runs and can never
    # match; a backtick may have been intended here - confirm before changing.
    string_sign = '!"\'-#$%&()*+,./:;<=>?@[\\]^_‘{|}~'
    with open(file, 'r', encoding='utf-8') as data:
        raw_text = data.read()

    # One pass to filter and one C-level pass to translate, instead of a
    # whole-string ``replace`` per character (quadratic on long texts).
    ascii_text = ''.join(ch for ch in raw_text if ch.isascii())
    sign_to_space = str.maketrans(string_sign, ' ' * len(string_sign))
    return ascii_text.lower().translate(sign_to_space)


def count_of_words(txt):
    """Count the total and the distinct words in already-cleaned text.

    Newlines are treated as spaces and the text is split on single space
    characters; empty and whitespace-only pieces are discarded. Purely
    numeric tokens are kept and counted as words.

    Args:
        txt: Text with punctuation and symbols already replaced by spaces.

    Returns:
        A tuple ``(total_words, distinct_words)``.
    """
    pieces = txt.replace('\n', ' ').split(' ')
    # Single-pass filter; the previous remove-while-looping approach was
    # quadratic, and its debug prints polluted the program's output.
    words = [piece for piece in pieces if piece and not piece.isspace()]
    return (len(words), len(set(words)))


def word_frequency(txt):
    """Count how many times each word occurs in already-cleaned text.

    Newlines are treated as spaces and the text is split on single space
    characters. Empty pieces, whitespace-only pieces and tokens made up
    entirely of digits are ignored.

    Args:
        txt: Text with punctuation and symbols already replaced by spaces.

    Returns:
        A dict mapping each word to its number of occurrences, with keys in
        order of first appearance.
    """
    import collections

    pieces = txt.replace('\n', ' ').split(' ')
    # Single-pass filter instead of repeated list.remove calls, which made
    # the cleanup quadratic in the number of tokens.
    words = [
        piece for piece in pieces
        if piece and not piece.isspace() and not piece.isdigit()
    ]
    return dict(collections.Counter(words))


def top_ten_words(frequency, cnt):
    """Print the ``cnt`` most frequent words, one ``word count`` pair per line.

    Words are ordered by descending count; words with equal counts keep the
    relative order they have in ``frequency``.

    Args:
        frequency: Dict mapping each word to its occurrence count.
        cnt: Number of entries to print.
    """
    ranked = sorted(frequency.items(), key=lambda pair: pair[1], reverse=True)
    for word, times in ranked[:cnt]:
        print(f'{word} {times}')


def top_ten_words_no_excludes(frequency, cnt):
    """Print the ``cnt`` most frequent words after dropping common stop words.

    Common articles, pronouns, linking verbs and conjunctions (the words in
    ``excludes_words`` below) are filtered out first. The remaining words are
    printed one ``word count`` pair per line, ordered by descending count;
    words with equal counts keep the relative order they have in
    ``frequency``.

    The ``frequency`` dictionary passed in is not modified.

    Args:
        frequency: Dict mapping each word to its occurrence count.
        cnt: Number of entries to print.
    """
    excludes_words = frozenset([
        'a', 'an', 'the', 'i', 'he', 'she', 'his', 'my', 'we',
        'or', 'is', 'was', 'do', 'and', 'at', 'to', 'of', 'it', 'on', 'that', 'her',
        'c', 'in', 'you', 'had', 's', 'with', 'for', 't', 'but', 'as', 'not', 'they',
        'be', 'were', 'so', 'our', 'all', 'would', 'if', 'him', 'from', 'no', 'me',
        'could', 'when', 'there', 'them', 'about', 'this', 'their', 'up', 'been',
        'by', 'out', 'did', 'have',
    ])
    # Filter into a fresh list of pairs rather than deleting keys from the
    # caller's dictionary, so the argument can safely be reused afterwards.
    kept = [
        (word, times) for word, times in frequency.items()
        if word not in excludes_words
    ]
    ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)
    for word, times in ranked[:cnt]:
        print(f'{word} {times}')


if __name__ == '__main__':
    # Load and clean the book text, then pre-compute the word frequencies.
    source_path = 'Who Moved My Cheese.txt'
    cleaned_text = read_file(source_path)  # cleaned text as a single string
    frequency_table = word_frequency(cleaned_text)

    # The first input line selects the task; tasks 1, 3 and 4 read a
    # second line holding an integer argument.
    choice = input()
    if choice == '2':
        # Task 2: total and distinct word counts.
        totals = count_of_words(cleaned_text)
        print('文章共有单词{}个,其中不重复单词{}个'.format(*totals))
    elif choice in ('1', '3', '4'):
        limit = int(input())
        if choice == '1':
            # Task 1: show the first `limit` characters of the cleaned text.
            print(cleaned_text[:limit])
        elif choice == '3':
            # Task 3: most frequent words, stop words included.
            top_ten_words(frequency_table, limit)
        else:
            # Task 4: most frequent words with stop words removed.
            top_ten_words_no_excludes(frequency_table, limit)

  • 4
    点赞
  • 26
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 6
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

二十四桥_

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值