主要利用了Adobe Acrobat DC软件的能力,python环境的配置略过…
第一步:
安装pywin32(win32com模块由pywin32包提供)
pip install pywin32
第二步:
下载Adobe Acrobat DC,软件本身是收费的,但是有万能的度娘,附个链接吧
提取码:8888
第三步:
执行下列代码
from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT
import os
import winerror
from time import sleep
# Add E_NOTIMPL to the ERRORS_BAD_CONTEXT list imported from win32com.client.dynamic.
# NOTE(review): presumably this lets dynamic dispatch tolerate "not implemented"
# replies from Acrobat's JavaScript bridge object — confirm against pywin32.
ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
def pdf2word(f_path, d_path):
    """Convert one PDF to HTML through Adobe Acrobat's COM automation interface.

    Despite the function name, the document is saved with the
    ``com.adobe.acrobat.html`` conversion ID, so the output is HTML.

    Args:
        f_path: Path of the PDF file to open.
        d_path: Path the converted file is written to.

    Returns:
        None. Prints ``ok`` on success, or ``error`` followed by the
        exception on failure; errors raised while converting are reported
        but not propagated. Errors raised while closing the documents
        still propagate.
    """
    # Bind both handles up front so the cleanup below never touches an
    # unassigned name when Dispatch() or Open() fails early.
    av_doc = None
    pd_doc = None
    try:
        av_doc = Dispatch("AcroExch.AVDoc")
        # Open() reports failure through its return value rather than by
        # raising, so turn a falsy result into an explicit error.
        if not av_doc.Open(f_path, ""):
            raise RuntimeError('Acrobat could not open ' + f_path)
        pd_doc = av_doc.GetPDDoc()
        js_object = pd_doc.GetJSObject()
        js_object.SaveAs(d_path, "com.adobe.acrobat.html")
        print('ok')
    except Exception as e:
        print('error')
        print(e)
    finally:
        # Nested try/finally: the AVDoc is closed even if closing the
        # PDDoc raises.
        try:
            if pd_doc is not None:
                pd_doc.Close()
        finally:
            if av_doc is not None:
                av_doc.Close(True)
# Batch driver: convert every PDF found under INPUT_DIR into an HTML file
# in OUTPUT_DIR, skipping files that have already been converted.
INPUT_DIR = r'D:\AAA_DWP\files'
OUTPUT_DIR = r'D:\AAA_DWP\files\output'

# Create the output folder up front so saving does not depend on it
# already existing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for path, dir_lst, file_lst in os.walk(INPUT_DIR):
    for file_name in file_lst:
        # Names starting with "~$" are skipped (presumably Office-style
        # temporary/lock files — confirm).
        if file_name.startswith("~$"):
            continue

        # Split off the real extension instead of str.replace(), which would
        # rewrite every ".pdf" occurrence in the name; compare
        # case-insensitively so "X.PDF" is picked up as well.
        stem, ext = os.path.splitext(file_name)
        if ext.lower() != ".pdf":
            continue

        f_path = os.path.join(path, file_name)
        d_path = os.path.join(OUTPUT_DIR, stem + '.html')

        # Check the location the converter actually writes to; looking next
        # to the source PDF would never find a finished conversion.
        # NOTE: PDFs with the same name in different sub-folders share one
        # output path, so only the first one encountered is converted.
        if os.path.exists(d_path):
            continue

        print('>>>>>>>>>>>>>>>>>>>> start:', file_name)
        pdf2word(f_path, d_path)
        print('>>>>>>>>>>>>>>>>>>>> finish')
        # Pause between conversions (presumably to let Acrobat settle before
        # the next document is opened — confirm the delay is still needed).
        sleep(10)