python编程:统计文件中出现次数最多的前10个词,并按出现次数排列它们。
下面我编写的这个程序可以统计文件中的字符数、行数、单词总数,以及出现次数最多的前10个单词(按出现次数从高到低排列)。
# -*- coding:utf-8 -*-
####首先把不相关的字符都去了,比如去除标点符号等
def normallize(s):
result = ''
for w in s.lower():
if w in keep:
result += w
return result
####其次划分字符串,然后得到多个单词,构建单词字典
def make_freq_dict(s):
s = normallize(s)
words = s.split()
# print word_num
dict = {}
for w in words:
if w in dict:
dict[w] +=1
else:
dict[w] = 1
return dict
####统计单词个数
def words_num(s):
d = make_freq_dict(s)
count = 0
for w in d:
count += d[w]
return count
####将字典转化为元组,然后根据元组排序
####需要注意元组排序的方法,首先排第一位,然后再排后面的
def words_order(s):
d = make_freq_dict(s)
lst = []
for w in d:
tuple_dict = (d[w], w)
lst.append(tuple_dict)
lst.sort()
lst.reverse()
return lst
if __name__ == "__main__":
# keep = "abcdefghijklmn-\'"
keep = {"a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "k", "l", "m", "n", "o", "p", "q", "r",
"s", "t", "u", "v", "w", "x", "y", "z", " ", "-", "\'"}
file_1 = open("bill", 'r').read()
characters_num = len(file_1)
lines_num = file_1.count("\n")
print characters_num
print lines_num
print words_num(file_1)
# file_1 = open("bill", 'r').read()
d = words_order(file_1)
i = 1
for count, word in d[:10]:
print i, count, word
i += 1
输出结果(依次为:字符数、行数、单词总数,以及出现次数最多的前10个单词,每行格式为“名次 次数 单词”):
640
14
106
1 6 the
2 5 thy
3 4 to
4 3 and
5 2 world's
6 2 thou
7 2 thine
8 2 that
9 2 tender
10 2 self