# -*- coding: utf-8 -*-
"""
Created on Mon Oct 03 11:07:58 2016
@author: liqi
"""
# Characters that survive normalization: the lowercase ASCII letters plus the
# space, hyphen and apostrophe.
keep = set("abcdefghijklmnopqrstuvwxyz -'")


def normalize(s):
    """Return a lowercased copy of ``s`` restricted to the characters in ``keep``.

    Whitespace that is not a plain space (newline, tab, ...) is replaced by a
    space instead of being dropped, so that the last word of one line and the
    first word of the next are not glued together. Every other character
    outside ``keep`` (digits, punctuation, ...) is removed.
    """
    kept = []
    for c in s.lower():
        if c in keep:
            kept.append(c)
        elif c.isspace():
            # Preserve the word boundary that this whitespace represents.
            kept.append(' ')
    return ''.join(kept)
def make_freq_dict(s):
    """Return a dict mapping each normalized word of ``s`` to its count.

    The text is split on whitespace first and each token is normalized on its
    own, so words separated only by a newline or tab are counted separately
    rather than fused into one. Tokens that normalize to the empty string
    (e.g. pure punctuation or digits) are ignored.
    """
    d = {}
    for token in s.split():
        w = normalize(token)
        if w:
            d[w] = d.get(w, 0) + 1
    return d
def print_file_stats(fname, top_n=20):
    """Print size statistics and the most frequent words of a text file.

    Parameters:
        fname: path of the text file to analyse.
        top_n: number of most-frequent words to list. Defaults to 20, the
            number of entries this function has always printed; the heading
            now reports the same number.

    Prints the character count, the number of newline characters, the total
    word count and a ranked list of ``(count, word)`` entries.
    """
    # The context manager closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')  # number of newline characters in the file
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # (count, word) pairs in descending order: highest count first, ties
    # ordered by the word in reverse alphabetical order.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file '%s' has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    print("\nThe top %s most frequent words are:" % top_n)
    for i, (count, word) in enumerate(lst[:top_n], 1):
        print('%2s. %4s %s' % (i, count, word))


def main():
    """Print the statistics report for the sample file ``bill.txt``."""
    print_file_stats('bill.txt')
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# Output:
#
# The file 'bill.txt' has:
#  34426 characters
#  94 lines
#  6215 words
#
# The top 10 most frequant words are:
#  1.  320 the
#  2.  260 i
#  3.  202 and
#  4.  183 to
#  5.  148 of
#  6.  147 a
#  7.  131 was
#  8.  124 in
#  9.   81 my
# 10.   64 he
# 11.   61 for
# 12.   57 had
# 13.   56 that
# 14.   51 it
# 15.   50 with
# 16.   50 me
# 17.   48 his
# 18.   47 on
# 19.   35 when
# 20.   35 but