文件统计

统计文件的字数,行数,高频词汇等


'''
Created on 2014/09/02

@author: wangz
'''

# Characters that survive normalization: lowercase ASCII letters, space,
# hyphen and apostrophe. Everything else is dropped.
keep = list("abcdefghijklmnopqrstuvwxyz -'")

def normalize(s):
    '''Return a lowercased copy of s containing only characters found in keep.

    s -- the string to normalize.

    Every character of s.lower() that is not in the module-level ``keep``
    list is removed; the remaining characters stay in their original order.
    '''
    # Collect the surviving characters and join once, instead of growing a
    # string with += (which may copy the whole string on every step).
    return ''.join([c for c in s.lower() if c in keep])

def normalize2(s):
    '''Return s lowercased with every character outside keep removed.'''
    lowered = s.lower()
    # Keep only the characters listed in the module-level keep list.
    survivors = filter(lambda ch: ch in keep, lowered)
    return ''.join(survivors)

def make_freq_dict(s):
    '''Return a dictionary mapping each word of s to how often it occurs.

    s -- the text to analyse.

    Words are the whitespace-separated pieces of s after normalize() has
    lowercased them and removed every character it does not keep. Pieces
    that normalize to an empty string are not counted.
    '''
    # normalize() drops newlines and tabs, which would glue the last word of
    # one line onto the first word of the next. Collapse every run of
    # whitespace to a single space first so word boundaries survive.
    s = normalize(' '.join(s.split()))
    d = {}
    for w in s.split():
        d[w] = d.get(w, 0) + 1
    return d

def print_file_stats(fname):
    '''Print size statistics and the ten most frequent words of a file.

    fname -- path of the file to read (opened in text mode 'r').

    Prints the length of the file contents, the number of newline
    characters, the total word count from make_freq_dict(), and up to ten
    (count, word) pairs with the highest counts. Returns None.
    Any error raised by open() or read() propagates to the caller.
    '''
    # The with-block closes the file handle even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    # Counts '\n' characters, so a final line without a trailing newline
    # is not included.
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())

    # Order by (count, word) descending: highest count first, equal counts
    # in reverse string order of the word.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)

    print("The file '%s' has: "%fname)
    print(" %s characters"%num_chars)
    print(" %s lines"%num_lines)
    print(" %s words"%num_words)
    print("\nThe top 10 most frequent word are:")
    for i, (count, word) in enumerate(lst[:10], 1):
        print('%2s. %4s %s'%(i,count,word))
        
# raw_input() exists only on Python 2; Python 3 renamed it to input().
# Pick whichever is available so the script runs on both.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Ask for a file path and print its statistics.
inputfile = read_line('input a file:')
print_file_stats(inputfile)


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值