这篇文章主要介绍了利用python进行英文词频统计,具有一定借鉴价值,需要的朋友可以参考下。希望大家阅读完这篇文章后大有收获,下面让小编带着大家一起了解一下。
1 __author__ = 'Oscar_Yang'
2 #-*- coding= utf-8 -*-
3 #copyRight by OSCAR
4 """
5 本脚本实现,合并几个英文文本,并且统计词频。6 脚本定义了几个函数:7 1、文件读取函数readFile(读取文件,输出每个文件的词频);8 2、元素为词频的字典的合并函数,并且实现相同词的词频相加,返回全部词频;9 3、调试部分,利用了高阶函数:map,reduce;10 4、最后实现格式化输出,输入结果如图片所示python小作品代码。11 """
import functools


# File-reading function: read one text file and return a word-frequency dict.
def readFile(file_name):
    """Read a UTF-8 text file and count its words.

    Each whitespace-separated token is stripped of surrounding punctuation
    (, . ! ? ; ") and lowercased before counting.

    Args:
        file_name: path of the text file to read.

    Returns:
        dict mapping lowercased word -> occurrence count.

    Bug fix: the original char-by-char while-loops raised IndexError on
    tokens made entirely of punctuation (e.g. "..."), because stripping
    emptied the string and `word2[0]` then indexed an empty string.
    Such tokens are now simply skipped.
    """
    tf = {}
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f:
            for token in line.split():
                # str.strip removes any run of the listed characters from both
                # ends in one call, replacing the original two while-loops.
                word = token.strip(',.!?;"').lower()
                if word:  # skip tokens that were pure punctuation
                    tf[word] = tf.get(word, 0) + 1
    return tf
def get_counts(words):
    """Count occurrences of each word in an iterable.

    Each word is lowercased and has all whitespace removed before being
    used as a dict key.

    Args:
        words: iterable of word strings.

    Returns:
        dict mapping normalized word -> occurrence count.

    Bug fix: the original built the frequency dict but never returned it,
    so every call yielded None.
    """
    tf = {}
    for word in words:
        # lowercase, then drop all whitespace (join of split)
        key = ''.join(word.lower().split())
        tf[key] = tf.get(key, 0) + 1
    return tf
# Dict-merge approach 1: accumulate dic1's counts into dic2 in place.
def merge1(dic1, dic2):
    """Fold the counts of dic1 into dic2 and return dic2 (mutated)."""
    for key, count in dic1.items():
        dic2[key] = dic2.get(key, 0) + count
    return dic2
# Dict-merge approach 2: combine via collections.Counter (non-mutating).
def merge2(dic1, dic2):
    """Return a Counter holding the element-wise sum of two count dicts."""
    from collections import Counter
    return Counter(dic1) + Counter(dic2)
# Get the n most frequent words together with their counts.
def top_counts(word_list, n=10):
    """Return up to n (count, word) pairs sorted by count, highest first.

    Args:
        word_list: dict mapping word -> count.
        n: maximum number of pairs to return (default 10).
    """
    pairs = [(count, word) for word, count in word_list.items()]
    pairs.sort(reverse=True)
    return pairs[:n]
# Test / driver section: merge the word counts of several files and print
# the most frequent words.
if __name__ == '__main__':
    file_list = [r'E:\graduate\Python\python那些事\articles\article_000.txt',
                 r'E:\graduate\Python\python那些事\articles\article_001.txt',
                 r'E:\graduate\Python\python那些事\articles\article_002.txt',
                 r'E:\graduate\Python\python那些事\articles\article_003.txt',
                 r'E:\graduate\Python\python那些事\articles\article_004.txt',
                 r'E:\graduate\Python\python那些事\articles\article_005.txt']

    # One frequency dict per file, merged into a single dict with reduce.
    per_file_counts = map(readFile, file_list)
    word_list = functools.reduce(merge2, per_file_counts)

    # Use a distinct name here: the original rebound `top_counts`,
    # shadowing the function it had just called.
    top_words = top_counts(word_list)
    print("最常用的单词排行榜:")
    for count, word in top_words[0:10]:
        print("{0:10}{1}".format(word, count))