# Count a file's characters, lines and words, and list its most frequent words.
'''
Created on 2014/09/02
@author: wangz
'''
# Characters that survive normalization: lowercase ASCII letters, space,
# hyphen and apostrophe.
keep = list("abcdefghijklmnopqrstuvwxyz -'")
def normalize(s):
    '''Convert s to normalized string.

    The string is lowercased and every character that is not listed in the
    module-level ``keep`` sequence is dropped; the surviving characters are
    returned in their original order.
    '''
    # Collect the kept characters and join once instead of growing a string
    # with repeated ``+=``, which can be quadratic in the length of s.
    kept = [c for c in s.lower() if c in keep]
    return ''.join(kept)
def normalize2(s):
    '''Return the lowercased form of s with only the characters in ``keep``.'''
    lowered = s.lower()
    # Keep a character only when it appears in the module-level ``keep``.
    return ''.join(filter(lambda ch: ch in keep, lowered))
def make_freq_dict(s):
    '''Returns a dictionary whose keys are the words of s,
    and whose values are the counts of those words.

    s is split on whitespace and each resulting token is passed through
    normalize(); tokens that normalize to the empty string are ignored.
    '''
    d = {}
    # Split on whitespace *before* normalizing. normalize() removes every
    # character it does not keep, including newlines and tabs, so normalizing
    # the whole text first would glue the last word of a line to the first
    # word of the next line.
    for token in s.split():
        w = normalize(token)
        if not w:
            # The token held no kept characters (e.g. a bare number).
            continue
        d[w] = d.get(w, 0) + 1
    return d
def print_file_stats(fname):
    '''Print statistics for the given file.

    Reads the whole file named fname and prints its character count, line
    count, word count, and up to ten of its most frequent words.

    The line count is the number of newline characters in the file, so a
    final line without a trailing newline is not counted. Word counts come
    from make_freq_dict().
    '''
    # Use a context manager so the file handle is always closed.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # Highest count first. The (count, word) tuples are compared as a whole,
    # so words with equal counts come out in reverse alphabetical order.
    lst = sorted(((count, w) for w, count in d.items()), reverse=True)
    print("The file '%s' has: "%fname)
    print(" %s characters"%num_chars)
    print(" %s lines"%num_lines)
    print(" %s words"%num_words)
    print("\nThe top 10 most frequent word are:")
    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %4s %s'%(i,count,word))
# Only prompt when run as a script, so importing this module to reuse its
# functions does not block waiting for input.
if __name__ == '__main__':
    # raw_input exists only on Python 2; on Python 3 the equivalent is input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    inputfile = read_line('input a file:')
    print_file_stats(inputfile)