Python 练习题：统计文件中的关键单词，并对代码进行优化

本文链接：https://blog.csdn.net/yzzsjc2008/article/details/80194013

习题一：统计文件中的关键单词，并对实现代码进行优化。

素材要求：一篇正常的英文文档。

正常实现的代码：

def makekey(s:str):
    """Return the lowercase words of *s*, with punctuation acting as a separator.

    Every character from the punctuation set below is replaced by a space,
    then the text is split on whitespace, so empty pieces never appear in
    the result.
    """
    separators = set(r"""!'"#./\()[],*-""")
    lowered = s.lower()
    # Blank out separators in one pass; str.split() then collapses the gaps.
    cleaned = ''.join(' ' if ch in separators else ch for ch in lowered)
    return cleaned.split()
# 上面是第一种方案：makekey 先逐字符替换、再 join/split，效率较低。
#
# 下面是第二种方案：makekey1 直接按下标切片取词，但仍可继续优化：
def makekey1(s:str):
    """Split *s* into lowercase pieces at punctuation characters.

    Unlike a whitespace split, only the punctuation characters listed below
    act as separators; whitespace inside *s* is kept as part of a piece, so
    callers are expected to split on whitespace first.

    Args:
        s: The text (typically one whitespace-free token) to split.

    Returns:
        A list of the non-empty lowercase substrings found between
        separators, in order of appearance.
    """
    separators = set(r"""!'"#./\()[],*-""")
    key = s.lower()
    ret = []
    start = 0  # index where the current, not yet emitted, piece begins

    for i, c in enumerate(key):
        if c not in separators:
            continue
        # start == i means the previous character was also a separator (or we
        # are at the very beginning), so there is nothing to emit.
        if start < i:
            ret.append(key[start:i])
        start = i + 1  # skip over the separator itself

    # Emit the trailing piece when the text does not end with a separator.
    # (The original hung this off a for/else, but with no break in the loop
    # the else branch always ran, so a plain statement is equivalent.)
    if start < len(key):
        ret.append(key[start:])

    return ret
#------------------------------------------#

# Count how often each word occurs in sample.txt. Counting is
# case-insensitive because makekey1 lowercases its input.
d = {}
with open('sample.txt', encoding='utf-8') as f:
    for line in f:
        # Split on whitespace first; makekey1 then splits each chunk on
        # punctuation only.
        for chunk in line.split():
            for word in makekey1(chunk):
                d[word] = d.get(word, 0) + 1

# Print the ten most frequent words as "rank word count". Slicing the sorted
# list avoids walking every remaining entry just to skip it. sorted() is
# stable, so words with equal counts keep the dict's iteration order.
top10 = sorted(d.items(), key=lambda item: item[1], reverse=True)[:10]
for i, (k, v) in enumerate(top10, 1):
    print(i, k, v)

执行结果：

1 path 138
2 the 136
3 is 60
4 a 59
5 os 49
6 if 43
7 and 40
8 to 34
9 on 33
10 of 33

Process finished with exit code 0

下面是代码的优化：

# =============== 下面对上面代码中的 makekey1 进行优化 ===============

# 1. 删除上面的第一种方案（makekey）。
# 2. 对第二种方案 makekey1 进行优化。
# 3. 排除字符集合有两种写法（模块级常量，或直接写在默认参数里），任选其一。
# CHARS = set("""!'"#./\()[],*- \r\n\t""")
# def _makekey2(key:str,chars=CHARS):
def _makekey2(key: str, chars=frozenset("!'\"#./\\()[],*- \r\n\t")):
    """Lazily yield the pieces of *key* found between separator characters.

    The text is scanned once and each piece is yielded as soon as its end is
    known; no intermediate list is built. Case is left untouched.

    Args:
        key: The text to split.
        chars: Characters treated as separators. The default covers common
            punctuation (including the backslash) plus space, carriage
            return, line feed and tab. Any object supporting ``in`` works.
            The default is evaluated once at definition time and is a
            frozenset so it cannot be mutated between calls.

    Yields:
        Each non-empty run of non-separator characters, in order.
    """
    start = 0  # index where the current, not yet yielded, piece begins

    for i, c in enumerate(key):
        if c not in chars:
            continue
        # start == i means the previous character was a separator too (or we
        # are at the very beginning): nothing to yield.
        if start < i:
            yield key[start:i]
        start = i + 1  # skip over the separator itself

    # Trailing piece when the text does not end with a separator.
    if start < len(key):
        yield key[start:]


#todo ：【下面是不区分大小写】：
def wordcount(filename: str, encoding='utf-8', ignorewords=frozenset()) -> dict:
    """Count case-insensitive word occurrences in a text file.

    Each line is split with ``_makekey2`` and every piece is lowercased
    before counting.

    Args:
        filename: Path of the file to read. (The original listing ignored
            this argument and always opened 'sample.txt'.)
        encoding: Text encoding passed to ``open``.
        ignorewords: Words to leave out of the result. They are compared
            against the lowercased words, so they should be lowercase. Any
            object supporting ``in`` works.

    Returns:
        A dict mapping each lowercased word to the number of times it occurs.
        Errors raised by ``open`` or by decoding the file are not caught.
    """
    d = {}
    with open(filename, encoding=encoding) as f:
        for line in f:
            for word in map(str.lower, _makekey2(line)):  # case-insensitive
                if word not in ignorewords:
                    d[word] = d.get(word, 0) + 1
    return d


# Case-sensitive variant: iterate ``_makekey2(line)`` directly instead of
# ``map(str.lower, _makekey2(line))``. The variant shown in the original
# listing also counted every word, i.e. it did not apply ignorewords.


def top(d: dict, n: int = 10):
    """Yield the *n* ``(key, value)`` pairs of *d* with the largest values.

    Pairs come out in descending order of value. ``sorted`` is stable, so
    pairs with equal values keep the dict's iteration order. Nothing is
    yielded when *n* is zero or negative.
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    # Clamp n so a negative value yields nothing rather than slicing from
    # the end of the list.
    yield from ranked[:max(n, 0)]


# The file name now includes its extension because wordcount really opens
# the path it is given.
for k, v in top(wordcount('sample.txt', ignorewords={'the', 'is'})):
    print(k, v)

执行结果：

path 138
a 59
os 49
if 43
and 40
to 34
on 33
of 33
return 30
windows 25

Process finished with exit code 0