习题一:统计文件中各单词的出现次数,并对实现代码进行优化。
素材要求: 一篇正常的英文文档。
正常实现的代码:
def makekey(s: str) -> list:
    """Split *s* into lower-cased words, treating punctuation as separators.

    Every character listed in ``chars`` is replaced by a space and the
    result is split on whitespace, so runs of separators never produce
    empty words.

    This is the straightforward first version: it builds a complete
    intermediate string and then has to split it again.
    """
    chars = set(r"""!'"#./\()[],*-""")
    key = s.lower()
    # Swap each separator for a space, then let str.split() do the cutting.
    return ''.join(' ' if c in chars else c for c in key).split()
#上面第一种方案:makekey 效率较低——先逐字符拼出一个新字符串,然后还要再 split 一次。
#
#下面第二种方案:makekey1 一次遍历直接切片,但还可以继续优化:
def makekey1(s: str) -> list:
    """Split *s* into lower-cased words in a single pass over the string.

    Characters in ``chars`` act as separators.  Instead of building an
    intermediate string, the function remembers where the current word
    started and slices it out when the next separator is reached.
    Consecutive separators produce no empty words.

    Returns the list of words in their original order.
    """
    chars = set(r"""!'"#./\()[],*-""")
    key = s.lower()
    ret = []
    start = 0  # index where the word currently being scanned begins
    for i, c in enumerate(key):
        if c not in chars:
            continue
        # start == i means the previous character was a separator too (or
        # we are at the very beginning), so there is no word to emit.
        if start < i:
            ret.append(key[start:i])
        start = i + 1  # skip over the separator itself
    # Whatever follows the last separator is the final word.
    if start < len(key):
        ret.append(key[start:])
    return ret
#------------------------------------------#
# Count how often each word occurs in sample.txt.
d = {}
with open('sample.txt', encoding='utf-8') as f:
    for line in f:
        # makekey1 does not treat whitespace as a separator, so split the
        # line into whitespace-delimited tokens first.
        for token in line.split():
            for word in makekey1(token):
                d[word] = d.get(word, 0) + 1

# Print the ten most frequent words with their 1-based rank.
ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
for i, (k, v) in enumerate(ranked[:10], 1):
    print(i, k, v)
执行结果:
1 path 138
2 the 136
3 is 60
4 a 59
5 os 49
6 if 43
7 and 40
8 to 34
9 on 33
10 of 33
Process finished with exit code 0
下面是代码的优化:
# =============== 下面对上面代码中的 makekey1 进行优化 ===============
# 1. 删除上面的第一种方案(makekey)。
# 2. 对第二种方案 makekey1 进行优化。
#TODO : 排除字符可以有两种写法,各位随意。
# CHARS = set("""!'"#./\()[],*- \r\n\t""")
# def _makekey2(key:str,chars=CHARS):
def _makekey2(key: str, chars=set("""!'"#./\()[],*- \r\n\t""")): #如果使用多次,可以用上面的方案.
start = 0
for i,c in enumerate(key):
if c in chars:
if start == i: #如果紧挨着还是特殊字符,start一定等于i.
start += 1 #加1并continue
continue
# ret.append((key[start:i]))
yield key[start:i]
start = i+1 #加1是跳过这个不需要的特殊字符c.
else:
if start < len(key): #小于,说明还有有效字符,而且一直到未尾。
# ret.append(key[start:])
yield key[start:]
# Word counting built on _makekey2.  Counting is case-insensitive by default;
# pass ignorecase=False for the case-sensitive variant.
def wordcount(filename: str, encoding='utf-8', ignorewords=frozenset(), *,
              ignorecase=True) -> dict:
    """Count how often each word occurs in a text file.

    The file is read line by line and every line is split into words with
    ``_makekey2``.

    :param filename: path of the file to read.
    :param encoding: text encoding used to open the file.
    :param ignorewords: words that must not be counted.  They are compared
        against the words after case folding, so give them in lower case
        when ``ignorecase`` is true.
    :param ignorecase: when true (the default) words are lower-cased before
        counting, so "Path" and "path" share one counter.
    :return: dict mapping each word to its number of occurrences.
    :raises OSError: if the file cannot be opened.
    """
    d = {}
    # Open the file the caller asked for, not a hard-coded name.
    with open(filename, encoding=encoding) as f:
        for line in f:
            words = _makekey2(line)
            if ignorecase:
                words = map(str.lower, words)
            for word in words:
                if word not in ignorewords:
                    d[word] = d.get(word, 0) + 1
    return d


def top(d: dict, n: int = 10):
    """Yield the *n* entries of *d* with the highest counts.

    :param d: mapping of word -> count, as returned by ``wordcount``.
    :param n: maximum number of entries to yield.
    :return: generator of ``(word, count)`` tuples, highest count first.
        Entries with equal counts keep the order they have in *d*.
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    for i, (k, v) in enumerate(ranked):
        if i >= n:
            break
        # Printing is left to the caller; this function only produces data.
        yield k, v


for k, v in top(wordcount('sample.txt', ignorewords={'the', 'is'})):
    print(k, v)
执行结果:
path 138
a 59
os 49
if 43
and 40
to 34
on 33
of 33
return 30
windows 25
Process finished with exit code 0