import os
from pdf2docx import Converter
from win32com import client as wc
# Requires the pywin32 package (pip install pywin32), which provides win32com.
# Convert a Word document (.docx) to a plain-text (.txt) file
def DocxToTxt(inputFinallyPath, outputFinallyPath):
    """Convert one Word document to a text file through Word COM automation.

    A hidden Word instance is started, the document is opened, saved in
    text format, then closed, and the Word instance is shut down again.

    Args:
        inputFinallyPath: Path of the document to open.
        outputFinallyPath: Path of the text file to write.

    Raises:
        Whatever error the COM layer raises if Word cannot open or save the
        document. The document and the Word instance are still released.
    """
    wordhandle = wc.Dispatch("Word.Application")
    try:
        wordhandle.Visible = 0  # run in the background, no window
        wordhandle.DisplayAlerts = 0  # suppress dialog prompts
        # Word resolves relative paths against its own working directory, not
        # the script's, so hand it absolute, normalized paths.
        doc = wordhandle.Documents.Open(os.path.abspath(inputFinallyPath))
        try:
            # 4 is wdFormatDOSText in Word's WdSaveFormat enumeration
            # (other values: 10 = filtered HTML, 16 = default docx, 17 = PDF).
            doc.SaveAs(os.path.abspath(outputFinallyPath), 4)
        finally:
            # Close must actually be called; the bare attribute access
            # `doc.Close` leaves the document open. False = do not save changes.
            doc.Close(False)
    finally:
        # Without Quit every call leaves a hidden WINWORD.EXE process behind.
        wordhandle.Quit()
if __name__ == '__main__':
    # Folder holding the .docx files to convert.
    inputPath = r'D:\pythonproject\pdf_to_txt\input'
    # Folder receiving the .txt files; an absolute path is recommended.
    outputPath = r'D:\pythonproject\pdf_to_txt\output'
    # Word cannot save into a folder that does not exist.
    os.makedirs(outputPath, exist_ok=True)

    # Convert every .docx in the input folder, counting successes.
    converted = 0
    for name in sorted(os.listdir(inputPath)):
        stem, ext = os.path.splitext(name)
        # Skip non-docx entries and the "~$..." lock files Word creates next
        # to documents that are currently open.
        if ext.lower() != '.docx' or name.startswith('~$'):
            continue
        print(name)
        inputFinallyPath = os.path.join(inputPath, name)
        # Swap only the final extension; str.replace would also rewrite a
        # '.docx' occurring in the middle of the file name.
        outputFinallyPath = os.path.join(outputPath, stem + '.txt')
        DocxToTxt(inputFinallyPath, outputFinallyPath)
        converted += 1
        print('第 %d 篇docx已转换为txt' % converted)
    print('共计%d篇docx文章已完全转换为txt' % converted)
# Batch conversion of docx files to txt with Python
# First published 2023-04-04 21:02:46