统计文件夹内doc文档内容的词频(Python)

本代码要处理的文件夹名为“En”,本代码文件所在的目录与“En”文件夹位于同一个上级目录下(运行时的当前工作目录应为代码文件所在的目录,程序会通过相对路径“../En”切换到该文件夹)。

程序思路是:先筛选出后缀为 .doc 的文件,将其另存为 docx 文件,再读取文件内容并按空白切分成单词,用一个字典统计词频,最后剔除无用词并输出出现次数最多的前 20 个词。

#coding:utf-8
import os
import docx
import win32com.client as wc

def main():
    """Print the 20 most frequent words found in the .doc files of the cwd.

    Every file in the current working directory whose extension is exactly
    ``.doc`` is opened through Word automation, saved next to the original
    as ``<name>.docx``, read back through ``getDocx`` and then the temporary
    ``.docx`` is deleted again. Words listed in ``excludes`` are dropped
    before the ranking is printed.

    Fewer than 20 lines are printed when fewer than 20 distinct words remain.
    """
    # Directory that is scanned for .doc files.
    file_path = os.getcwd()

    # Stop words that are removed from the final statistics.
    excludes = {"the", "a", "of", "is", "in", "to", "for", "and",
                "an", "that", "be", "or", "as", "are", "one", "this",
                "each", "may", "with", "we", "not", "2", "e", "name",
                "by", "have", "which", "on", "has", "r", "many", "it",
                "other", "can", "use", "you", "some", "will", "more",
                "such", "type", "types", "from"}

    # Maps each word to the number of times it occurred.
    counts = {}

    # Keep only the files with a .doc suffix.
    doc_files = [entry for entry in os.listdir(file_path)
                 if os.path.splitext(entry)[1] == '.doc']

    if doc_files:
        # Start Word once for the whole run instead of once per file, and
        # always quit it so no WINWORD.EXE process is left behind.
        word_app = wc.Dispatch("Word.Application")
        try:
            for file in doc_files:
                # Base name without the suffix; getDocx appends ".docx".
                file_name = os.path.splitext(file)[0]

                # Full path of the source .doc and of the temporary .docx.
                # NOTE(review): a pre-existing .docx with the same base name
                # would be replaced by SaveAs and then deleted below -- confirm
                # this is acceptable for the target folder.
                doc_name = os.path.join(file_path, file)
                docx_name = '{}x'.format(doc_name)

                # Convert the .doc to .docx; 12 is Word's wdFormatXMLDocument.
                doc = word_app.Documents.Open(doc_name)
                try:
                    doc.SaveAs(docx_name, 12)
                finally:
                    doc.Close()

                try:
                    # Read the converted text and count every token.
                    for token in getDocx(file_name).split():
                        counts[token] = counts.get(token, 0) + 1
                finally:
                    # Remove the temporary .docx even if reading failed.
                    os.remove(docx_name)
        finally:
            word_app.Quit()

    # Drop the stop words; a stop word that never occurred is simply skipped.
    for stop_word in excludes:
        counts.pop(stop_word, None)

    # Rank the words by descending frequency.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # Print at most the 20 most frequent words.
    for token, freq in items[:20]:
        print("{0:<10}  \t{1:>5}".format(token, freq))

def getDocx(file_name):
    """Return the lower-cased, punctuation-stripped text of ``<file_name>.docx``.

    Args:
        file_name: Path of the document without the ``.docx`` suffix; the
            suffix is appended before the file is opened.

    Returns:
        A single string holding the text of all paragraphs, lower-cased,
        with each listed punctuation character replaced by a space so the
        caller can tokenize it with ``str.split()``.
    """
    document = docx.Document("{}.docx".format(file_name))

    # Join paragraphs with a space: concatenating them directly would fuse
    # the last word of one paragraph with the first word of the next.
    txt = " ".join(paragraph.text for paragraph in document.paragraphs)
    txt = txt.lower()

    # Replace every punctuation character with a space in a single pass.
    punctuation = '!"#@$%^&*()_+{}|;:`~/?.>,<=-'
    return txt.translate(str.maketrans(dict.fromkeys(punctuation, " ")))

if __name__ == "__main__":
    # Switch the working directory to the folder that holds the .doc files
    # to analyse; the path is relative to the directory the script is run from.
    os.chdir("../En")

    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值