统计文件夹内doc文档内容的词频（Python）

最新推荐文章于 2023-08-13 15:43:53 发布

阿广 ້໌ᮨᮨ

最新推荐文章于 2023-08-13 15:43:53 发布

阅读量554

点赞数

文章标签： python

本文链接：https://blog.csdn.net/weixin_51285375/article/details/124419573

版权

本代码要处理的文件夹名为“En”；本代码文件所在的目录需与“En”文件夹位于同一个上级目录下（程序通过 os.chdir("../En") 切换到该文件夹）。

程序思路是：先筛选出后缀为 .doc 的文件，将其另存为 .docx 文件，再读取文件内容，把每个单词的出现次数存入字典，从而统计词频。

#coding:utf-8
import os
import tempfile
from collections import Counter

import docx
import win32com.client as wc

def main():
    """Print the 20 most frequent words of the .doc files in the working directory.

    Every ``.doc`` file in the current working directory is converted to
    ``.docx`` through Word COM automation, read back with ``getDocx``, split
    on whitespace and counted. Words listed in ``excludes`` are removed
    before the ranking is printed. Nothing is printed when no words are found.
    """
    folder = os.getcwd()

    # Stop words that must not appear in the ranking.
    excludes ={"the","a","of","is","in","to","for","and",
               "an","that","be","or","as","are","one","this",
               "each","may","with","we","not","2","e","name",
               "by","have","which","on","has","r","many","it",
               "other","can","use","you","some","will","more",
               "such","type","types","from"}

    # Keep only .doc files (suffix compared case-insensitively). Names starting
    # with "~$" are the lock files Word leaves next to an open document and
    # cannot be opened as documents.
    doc_files = [
        entry for entry in os.listdir(folder)
        if os.path.splitext(entry)[1].lower() == '.doc'
        and not entry.startswith('~$')
    ]

    counts = Counter()

    # Start Word only when there is something to convert.
    if doc_files:
        word_app = wc.Dispatch("Word.Application")
        try:
            # Converted copies go to a throw-away directory so an existing
            # .docx with the same name in the source folder is never
            # overwritten or deleted.
            with tempfile.TemporaryDirectory() as tmp_dir:
                for entry in doc_files:
                    stem = os.path.splitext(entry)[0]
                    converted_stem = os.path.join(tmp_dir, stem)

                    doc = word_app.Documents.Open(os.path.join(folder, entry))
                    try:
                        # 12 is the file-format code the script has always
                        # used to save as .docx.
                        doc.SaveAs('{}.docx'.format(converted_stem), 12)
                    finally:
                        doc.Close()

                    # getDocx appends ".docx" to the name it is given.
                    counts.update(getDocx(converted_stem).split())
        finally:
            # Shut the automation instance down so no Word process is left behind.
            word_app.Quit()

    # Drop stop words; a stop word that never occurred is simply skipped.
    for stop_word in excludes:
        counts.pop(stop_word, None)

    # most_common caps the output at 20 entries and copes with fewer.
    for token, freq in counts.most_common(20):
        print("{0:<10}  \t{1:>5}".format(token, freq))

def getDocx(file_name):
    """Return the normalised text of the document ``<file_name>.docx``.

    Args:
        file_name: Path of the document without its ``.docx`` extension.

    Returns:
        The text of all paragraphs, lower-cased, with the listed punctuation
        characters replaced by spaces, ready to be split on whitespace.
    """
    document = docx.Document("{}.docx".format(file_name))

    # Join paragraphs with a newline so the last word of one paragraph is not
    # fused with the first word of the next when the text is split later.
    txt = "\n".join(paragraph.text for paragraph in document.paragraphs).lower()

    # Replace each punctuation character with a space in a single pass.
    punctuation = '!"#@$%^&*()_+{}|;:`~/?.>,<=-'
    return txt.translate(str.maketrans(punctuation, " " * len(punctuation)))

# Run the analysis only when executed as a script, so importing this module
# neither changes the working directory nor starts Word.
if __name__ == "__main__":
    # Switch the working directory to the folder holding the .doc files to
    # analyse; main() scans the current working directory.
    os.chdir("../En")

    main()