参考文献:《Python编程入门》--Toby Donaldson 著
问题:
计算并打印文本文件内容的统计数据。我们想知道给定文本文件'bill.txt'包含多少个字符、行和单词。除单词数外,我们还想知道文件中出现次数最多的前10个单词(可以排除掉一些功能词),并按出现次数排列它们。
#wordstats.py
# Characters that normalize() keeps: the lowercase ASCII letters plus the
# space, hyphen and apostrophe. Every other character is discarded.
keep = {'a', 'b', 'c', 'd', 'e',
        'f', 'g', 'h', 'i', 'j',
        'k', 'l', 'm', 'n', 'o',
        'p', 'q', 'r', 's', 't',
        'u', 'v', 'w', 'x', 'y',
        'z',
        ' ', '-', "'"}


def normalize(s):
    """Convert s to a normalized string.

    The text is lower-cased and every character that is not in ``keep``
    is removed. Whitespace other than the plain space (newlines, tabs,
    ...) is replaced by a space instead of being dropped, so the last
    word of one line is not glued to the first word of the next line.

    Args:
        s: The text to normalize.

    Returns:
        A string consisting only of characters found in ``keep``.
    """
    kept = []
    for c in s.lower():
        if c in keep:
            kept.append(c)
        elif c.isspace():
            # Word separators must survive normalization; otherwise
            # "end\nstart" would collapse into the single word "endstart".
            kept.append(' ')
    # Join once at the end rather than growing a string with +=.
    return ''.join(kept)
# Turn a string into a dictionary of word -> number of occurrences.
def make_freq_dict(s):
    """Count how often each word occurs in s.

    The text is first passed through normalize() and then split on
    whitespace; each resulting token counts as one word.

    Args:
        s: The text whose words are counted.

    Returns:
        A dict mapping every word of the normalized text to the number
        of times it appears.
    """
    counts = {}
    for word in normalize(s).split():
        counts[word] = counts.get(word, 0) + 1
    return counts
def print_file_stats(fname):
    """Print statistics for the given text file.

    Prints the number of characters, lines and words in the file,
    followed by up to ten of its most frequent words (ignoring a small
    set of common function words), most frequent first.

    Args:
        fname: Path of the text file to analyse.
    """
    # A context manager guarantees the file handle is closed after reading.
    with open(fname, 'r') as f:
        s = f.read()

    # Character and line counts are taken before s is normalized.
    num_chars = len(s)
    num_lines = s.count('\n')
    if s and not s.endswith('\n'):
        # A final line without a trailing newline is still a line.
        num_lines += 1

    d = make_freq_dict(s)
    num_words = sum(d.values())  # stop words are part of the total

    stop_words = {'the', 'and', 'i', 'to', 'of', 'a', 'you', 'my', 'that', 'in'}

    # Stop words are left out of the ranking entirely (rather than being
    # zeroed in d), so they can never show up with a count of 0 and d is
    # not modified. Pairs are (count, word) so sorting orders by count
    # first and by word on ties; reverse=True puts the largest first.
    lst = sorted(
        ((count, word) for word, count in d.items() if word not in stop_words),
        reverse=True,
    )

    print("THe file '%s' has: " % fname)
    print(" %d characters" % num_chars)
    print(" %d lines" % num_lines)
    print(" %d words" % num_words)
    print("\nThe top 10 most frequent words are:")
    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2d.%4d %s' % (i, count, word))
def main():
    """Print the statistics for the sample file 'bill.txt'."""
    print_file_stats('bill.txt')


# The module name of a directly executed script is '__main__' (two
# trailing underscores); any other spelling means main() never runs.
if __name__ == '__main__':
    main()
程序演示:
>>> main()
THe file 'bill.txt' has:
525843 characters
10146 lines
87036 words
The top 10 most frequent words are:
1.1369 it
2.1064 was
3. 986 is
4. 954 he
5. 708 have
6. 663 for
7. 555 but
8. 531 not
9. 488 as
10. 466 me