- 用到的模块是 Python 的 pdf2docx
pip install pdf2docx
- 实现方式也很简单,直接贴代码了
import os
from pdf2docx import Converter
def pdf_docx(pdf_path=r'D:\2024\2024_python_project\pdf',
             doc_path=r'D:\2024\2024_python_project\docs'):
    """Convert every PDF found directly inside ``pdf_path`` into a .docx file.

    Only the top level of ``pdf_path`` is scanned (no recursion). Each
    ``<name>.pdf`` produces ``<name>.docx`` inside ``doc_path``.

    Args:
        pdf_path: Directory that holds the PDF files to convert.
        doc_path: Directory that receives the generated .docx files. It is
            created if it does not exist yet.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist (from ``os.listdir``).
        Any exception raised by ``Converter`` during a conversion propagates
        to the caller after the converter has been closed.
    """
    # Create the output directory up front so writing a .docx never fails
    # just because the folder is missing.
    os.makedirs(doc_path, exist_ok=True)

    for file in os.listdir(pdf_path):
        file_name, suff_name = os.path.splitext(file)
        # Compare case-insensitively so files such as "REPORT.PDF" are not skipped.
        if suff_name.lower() != '.pdf':
            continue

        pdf_name = os.path.join(pdf_path, file)
        # A directory whose name happens to end in ".pdf" cannot be converted.
        if not os.path.isfile(pdf_name):
            continue
        docx_name = os.path.join(doc_path, file_name + '.docx')

        cv = Converter(pdf_name)
        try:
            cv.convert(docx_name)
        finally:
            # Release the PDF handle even when the conversion raises.
            cv.close()
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    pdf_docx()
- 根据pdf页数的不同,转换时间也会不同,如图
[INFO] Start to convert XXX.pdf  # PDF路径
[INFO] [1/4] Opening document...
[INFO] [2/4] Analyzing document...
[INFO] [3/4] Parsing pages...
[INFO] (1/5) Page 1
[INFO] (2/5) Page 2
[INFO] (3/5) Page 3
[INFO] (4/5) Page 4
[INFO] (5/5) Page 5
[INFO] [4/4] Creating pages...
[INFO] (1/5) Page 1
[INFO] (2/5) Page 2
[INFO] (3/5) Page 3
[INFO] (4/5) Page 4
[INFO] (5/5) Page 5
[INFO] Terminated in 2.60s.
Process finished with exit code 0