直接上代码
# -*- coding: utf-8 -*-
"""Convert Word documents to PDF or HTML, and dump the text of a PDF to a file.

author: lgh

``doc2pdf`` and ``doc2html`` drive the ``Word.Application`` COM object through
win32com; ``pdf2doc`` extracts text with pdfminer.
Depends on pdfminer3k: ``pip install pdfminer3k``.
"""
import os

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfinterp import (
    PDFPageInterpreter,
    PDFResourceManager,
    PDFTextExtractionNotAllowed,
)
from pdfminer.pdfparser import PDFDocument, PDFParser
from win32com.client import Dispatch, constants

# Numeric file-format code handed to Document.SaveAs; 8 is wdFormatHTML.
WD_FORMAT_HTML = 8


def doc2pdf(input: str, output: str) -> bool:
    """Export the Word document at *input* as a PDF written to *output*.

    Args:
        input: Path of the Word document to open (opened read-only).
        output: Path of the PDF file to create.

    Returns:
        True when the export call completed, False when any exception was
        raised while opening or exporting (the exception is printed).
    """
    word = Dispatch('Word.Application')
    try:
        # Word runs as a separate process, so hand it absolute paths rather
        # than paths relative to this script's working directory.
        doc = word.Documents.Open(os.path.abspath(input), ReadOnly=1)
        doc.ExportAsFixedFormat(
            os.path.abspath(output),
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Always shut Word down, discarding any changes.
        # NOTE(review): `constants` is presumably only populated when the Word
        # type library has been generated (makepy / gencache) - confirm.
        word.Quit(constants.wdDoNotSaveChanges)


def doc2html(input: str, output: str) -> bool:
    """Save the Word document at *input* as HTML written to *output*.

    Args:
        input: Path of the Word document to open (opened read-only).
        output: Path of the HTML file to create.

    Returns:
        True when the save call completed, False when any exception was
        raised while opening or saving (the exception is printed).
    """
    word = Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(os.path.abspath(input), ReadOnly=1)
        doc.SaveAs(os.path.abspath(output), WD_FORMAT_HTML)
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Always shut Word down, discarding any changes.
        word.Quit(constants.wdDoNotSaveChanges)


def pdf2doc(input: str, output: str) -> bool:
    """Append the text of every horizontal text box in a PDF to *output*.

    The text is written as UTF-8, one text box after another, each followed
    by a newline. *output* is opened in append mode, so existing content is
    kept.

    Args:
        input: Path of the PDF file to read.
        output: Path of the text file to append to.

    Returns:
        True when all pages were processed, False when any exception was
        raised, including the PDF not allowing text extraction (the
        exception is printed).
    """
    try:
        with open(input, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # No password is supplied.
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Open the output once instead of re-opening it per text box.
            with open(output, 'a', encoding='utf-8') as out_file:
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for item in device.get_result():
                        if isinstance(item, LTTextBoxHorizontal):
                            out_file.write(item.get_text() + '\n')
        return True
    except Exception as e:
        print(e)
        return False


def main() -> None:
    """Run the PDF-to-text conversion on the sample paths and print the result."""
    # doc2pdf(src, dst) and doc2html(src, dst) are called the same way.
    src = r'F:\save_data\流畅的Python.pdf'
    dst = r'F:\save_data\test.doc'
    rc = pdf2doc(src, dst)
    if rc:
        print('转换成功')
    else:
        print('转换失败')


if __name__ == '__main__':
    main()
以上的 doc2pdf 和 doc2html 其实是通过 COM 来调用 Office API(pdf2doc 用的是 pdfminer),其他语言貌似也可以用同样的方式调用。
当然你也可以用上面的代码将word文件转换成任意格式文件(只要office 2007支持,比如将word文件转换成PDF文件,把8改成17即可),下面是office 2007支持的全部文件格式对应表:
wdFormatDocument = 0
wdFormatDocument97 = 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18

照着字面意思应该能对应到相应的文件格式;如果你用的是 Office 2003,可能支持不了这么多格式。Word 文件转 HTML 有两种格式可选:wdFormatHTML 和 wdFormatFilteredHTML(对应数字 8 和 10)。区别在于:选用 wdFormatHTML 时,Word 文件里的公式等 OLE 对象会存储成 WMF 格式;而选用 wdFormatFilteredHTML 时,公式图片会存储为 GIF 格式,并且可以明显看出用 wdFormatFilteredHTML 生成的 HTML 比 wdFormatHTML 要干净许多。
参考自https://blog.csdn.net/binger819623/article/details/6770932