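# The documents to process live in a directory named "En"; the parent directory of this script and "En" must sit in the same directory.
# Approach: pick out the files with a .doc extension, convert each one to .docx and save it, then read the file contents and tally word frequencies in a dictionary.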
#coding:utf-8
import os
import docx
import win32com.client as wc
def main():
    """Print the most frequent words found in the .doc files of the cwd.

    Every file in the current working directory whose extension is exactly
    ``.doc`` is opened through Word's COM automation interface and saved as
    a ``.docx`` copy in a temporary directory. The copy is read back with
    ``getDocx``, split on whitespace and tallied. Words listed in
    ``excludes`` are then dropped and up to 20 of the most frequent
    remaining words are printed, most frequent first.
    """
    # Local import: only this function needs it.
    import tempfile

    # Directory holding the documents to analyse.
    file_path = os.getcwd()
    # Words that are ignored in the final ranking.
    excludes = {"the", "a", "of", "is", "in", "to", "for", "and",
                "an", "that", "be", "or", "as", "are", "one", "this",
                "each", "may", "with", "we", "not", "2", "e", "name",
                "by", "have", "which", "on", "has", "r", "many", "it",
                "other", "can", "use", "you", "some", "will", "more",
                "such", "type", "types", "from"}
    # word -> number of occurrences across all documents
    counts = {}
    # Keep only the files with a '.doc' extension.
    doc_files = [
        entry for entry in os.listdir(file_path)
        if os.path.splitext(entry)[1] == '.doc'
    ]
    if doc_files:
        # Start Word once for the whole batch instead of once per file.
        word_app = wc.Dispatch("Word.Application")
        try:
            # Converted copies go to a throw-away directory so that an
            # existing "<name>.docx" next to the source is never
            # overwritten or deleted.
            with tempfile.TemporaryDirectory() as tmp_dir:
                for file in doc_files:
                    file_name = os.path.splitext(file)[0]
                    doc_name = os.path.join(file_path, file)
                    # getDocx appends ".docx" itself, so pass the base path.
                    docx_base = os.path.join(tmp_dir, file_name)
                    doc = word_app.Documents.Open(doc_name)
                    try:
                        # 12 is Word's save-format code for .docx
                        # (wdFormatXMLDocument).
                        doc.SaveAs('{}.docx'.format(docx_base), 12)
                    finally:
                        # Close the document even if the conversion failed.
                        doc.Close()
                    # Tally every whitespace-separated token of the document.
                    for token in getDocx(docx_base).split():
                        counts[token] = counts.get(token, 0) + 1
        finally:
            # Do not leave a Word process behind. Dispatch may have attached
            # to an instance the user already had open, so only quit when no
            # document is left open in it.
            if word_app.Documents.Count == 0:
                word_app.Quit()
    # Drop the stop words; a stop word that never occurred is simply skipped.
    for stop_word in excludes:
        counts.pop(stop_word, None)
    # Rank by frequency, highest first.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # Print the top 20 entries, or fewer when the corpus is small.
    for token, count in items[:20]:
        print("{0:<10} \t{1:>5}".format(token, count))
def getDocx(file_name):
    """Return the normalised text of the document ``<file_name>.docx``.

    Args:
        file_name: Path of the document without its ``.docx`` extension.

    Returns:
        The text of all paragraphs joined by single spaces, lower-cased,
        with every character of the punctuation set below replaced by a
        space.
    """
    document = docx.Document("{}.docx".format(file_name))
    # Join with a space so the last word of one paragraph does not fuse
    # with the first word of the next one.
    txt = " ".join(paragraph.text for paragraph in document.paragraphs).lower()
    # Replace each punctuation character with a space in a single pass.
    punctuation = '!"#@$%^&*()_+{}|;:`~/?.>,<=-'
    return txt.translate(str.maketrans(punctuation, " " * len(punctuation)))
# Run only when executed as a script, so importing this module has no side
# effects.
if __name__ == "__main__":
    # Switch the working directory to the "En" folder that holds the .doc
    # files. The path is resolved relative to this script's own location so
    # the script works no matter which directory it is launched from.
    os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "En"))
    main()