基于openOffice和python实现office转pdf和html示例代码

本文介绍了如何通过Python脚本利用OpenOffice将Office文档转换为PDF或HTML格式。首先需要启动OpenOffice服务,然后使用Python的UNO桥接功能与OpenOffice实例交互,实现不同格式间的转换。文章提供了详细的脚本代码和转换配置,支持多种输入和输出格式。
摘要由CSDN通过智能技术生成

将office文件转化为html格式或者pdf格式

在转换之前,需要先启动openOffice的服务:在openOffice安装目录下打开命令窗口,执行 soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard 即可启动(注意引号必须是英文半角引号)。
不知道如何启动的参照我的另外一篇文章

我电脑上安装的是python3.8
在这里插入图片描述
python的安装,在这里我就不多说了,在座的老司机应该都熟悉了。

准备好了环境之后,话不多说,开始编写脚本。
脚本代码如下:

#
# PyODConverter (Python OpenDocument Converter) v1.1 - 2009-11-14
#
# This script converts a document from one office format to another by
# connecting to an OpenOffice.org instance via Python-UNO bridge.
#
# Copyright (C) 2008-2009 Mirko Nasato <mirko@artofsolving.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl-2.1.html
# - or any later version.
#
# Port used by DocumentConverter when the caller does not pass one explicitly.
DEFAULT_OPENOFFICE_PORT = 8100

import uno
from os.path import abspath, isfile, splitext
from com.sun.star.beans import PropertyValue
from com.sun.star.task import ErrorCodeIOException
from com.sun.star.connection import NoConnectException

# Document families; they key the per-family settings defined below.
FAMILY_TEXT = "Text"
FAMILY_WEB = "Web"
FAMILY_SPREADSHEET = "Spreadsheet"
FAMILY_PRESENTATION = "Presentation"
FAMILY_DRAWING = "Drawing"

# ---------------------#
# Configuration Start #
# ---------------------#

# Filter names are documented at
# http://wiki.services.openoffice.org/wiki/Framework/Article/Filter

# Input formats are normally auto-detected, so only the extensions that need
# explicit filter options are listed.
IMPORT_FILTER_MAP = {
    "txt": {
        "FilterName": "Text (encoded)",
        "FilterOptions": "utf8",
    },
    "csv": {
        "FilterName": "Text - txt - csv (StarCalc)",
        "FilterOptions": "44,34,0",
    },
}

# Output extension -> document family -> properties handed to storeToURL.
EXPORT_FILTER_MAP = {
    "pdf": {
        FAMILY_TEXT: {"FilterName": "writer_pdf_Export"},
        FAMILY_WEB: {"FilterName": "writer_web_pdf_Export"},
        FAMILY_SPREADSHEET: {"FilterName": "calc_pdf_Export"},
        FAMILY_PRESENTATION: {"FilterName": "impress_pdf_Export"},
        FAMILY_DRAWING: {"FilterName": "draw_pdf_Export"},
    },
    "html": {
        FAMILY_TEXT: {"FilterName": "HTML (StarWriter)"},
        FAMILY_SPREADSHEET: {"FilterName": "HTML (StarCalc)"},
        FAMILY_PRESENTATION: {"FilterName": "impress_html_Export"},
    },
    # Text document formats.
    "odt": {
        FAMILY_TEXT: {"FilterName": "writer8"},
        FAMILY_WEB: {"FilterName": "writerweb8_writer"},
    },
    "doc": {
        FAMILY_TEXT: {"FilterName": "MS Word 97"},
    },
    "rtf": {
        FAMILY_TEXT: {"FilterName": "Rich Text Format"},
    },
    "txt": {
        FAMILY_TEXT: {
            "FilterName": "Text",
            "FilterOptions": "utf8",
        },
    },
    # Spreadsheet formats.
    "ods": {
        FAMILY_SPREADSHEET: {"FilterName": "calc8"},
    },
    "xls": {
        FAMILY_SPREADSHEET: {"FilterName": "MS Excel 97"},
    },
    "csv": {
        FAMILY_SPREADSHEET: {
            "FilterName": "Text - txt - csv (StarCalc)",
            "FilterOptions": "44,34,0",
        },
    },
    # Presentation and drawing formats.
    "odp": {
        FAMILY_PRESENTATION: {"FilterName": "impress8"},
    },
    "ppt": {
        FAMILY_PRESENTATION: {"FilterName": "MS PowerPoint 97"},
    },
    "swf": {
        FAMILY_DRAWING: {"FilterName": "draw_flash_Export"},
        FAMILY_PRESENTATION: {"FilterName": "impress_flash_Export"},
    },
}

# Page-style properties applied to every page style of a loaded document,
# keyed by document family.
PAGE_STYLE_OVERRIDE_PROPERTIES = {
    FAMILY_SPREADSHEET: {
        # --- Scale options: enable exactly one of the three ---
        # a) 'Reduce / enlarge printout': 'Scaling factor'
        "PageScale": 100,
        # b) 'Fit print range(s) to width / height':
        #    'Width in pages' and 'Height in pages'
        # "ScaleToPagesX": 1, "ScaleToPagesY": 1000,
        # c) 'Fit print range(s) on number of pages'
        # "ScaleToPages": 1,
        "PrintGrid": False,
    },
}


# -------------------#
# Configuration End #
# -------------------#

class DocumentConversionException(Exception):
    """Raised when a document cannot be converted.

    The human-readable reason is available both as ``message`` and through
    ``str(exception)``.
    """

    def __init__(self, message):
        # Hand the message to Exception as well, so that ``args``, ``repr()``
        # and pickling carry it instead of an empty tuple.
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        return self.message


class DocumentConverter:
    """Converts documents between office formats through an OpenOffice instance.

    The instance is reached over the Python-UNO bridge on a local socket. The
    import and export filters are selected from the input and output file
    extensions.
    """

    def __init__(self, port=DEFAULT_OPENOFFICE_PORT):
        """Connect to the OpenOffice instance listening on localhost:``port``.

        Raises:
            DocumentConversionException: if UNO reports NoConnectException.
        """
        localContext = uno.getComponentContext()
        resolver = localContext.ServiceManager.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", localContext)
        try:
            context = resolver.resolve("uno:socket,host=localhost,port=%s;urp;StarOffice.ComponentContext" % port)
        except NoConnectException:
            raise DocumentConversionException("failed to connect to OpenOffice.org on port %s" % port)
        self.desktop = context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)

    def convert(self, inputFile, outputFile):
        """Load ``inputFile`` and store it as ``outputFile``.

        The export filter is looked up from the extension of ``outputFile``
        and the family of the loaded document.

        Raises:
            DocumentConversionException: if the document cannot be loaded, its
                family is unknown, or the output extension is unknown or not
                supported for that family.

        Exceptions raised by the UNO calls themselves are not translated.
        """
        inputUrl = self._toFileUrl(inputFile)
        outputUrl = self._toFileUrl(outputFile)

        loadProperties = {"Hidden": True}
        inputExt = self._getFileExt(inputFile)
        # ``in`` instead of dict.has_key(): has_key() does not exist on Python 3.
        if inputExt in IMPORT_FILTER_MAP:
            loadProperties.update(IMPORT_FILTER_MAP[inputExt])

        document = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self._toProperties(loadProperties))
        if document is None:
            # Report a failed load as a conversion error rather than letting
            # it surface later as an AttributeError on None.
            raise DocumentConversionException("failed to load input document: '%s'" % inputFile)

        # Everything after a successful load runs under try/finally so the
        # document is closed even when filter lookup or the export fails.
        try:
            try:
                document.refresh()
            except AttributeError:
                # Documents without a refresh() method are used as loaded.
                pass

            family = self._detectFamily(document)
            self._overridePageStyleProperties(document, family)

            outputExt = self._getFileExt(outputFile)
            storeProperties = self._getStoreProperties(document, outputExt)
            document.storeToURL(outputUrl, self._toProperties(storeProperties))
        finally:
            document.close(True)

    def _overridePageStyleProperties(self, document, family):
        """Apply the configured property overrides to every page style.

        Does nothing when PAGE_STYLE_OVERRIDE_PROPERTIES has no entry for
        ``family``.
        """
        if family not in PAGE_STYLE_OVERRIDE_PROPERTIES:
            return
        properties = PAGE_STYLE_OVERRIDE_PROPERTIES[family]
        pageStyles = document.getStyleFamilies().getByName('PageStyles')
        for styleName in pageStyles.getElementNames():
            pageStyle = pageStyles.getByName(styleName)
            for name, value in properties.items():
                pageStyle.setPropertyValue(name, value)

    def _getStoreProperties(self, document, outputExt):
        """Return the export properties for ``document`` and ``outputExt``.

        Raises:
            DocumentConversionException: if ``outputExt`` is not in
                EXPORT_FILTER_MAP, or has no entry for the document's family.
        """
        family = self._detectFamily(document)
        try:
            propertiesByFamily = EXPORT_FILTER_MAP[outputExt]
        except KeyError:
            raise DocumentConversionException("unknown output format: '%s'" % outputExt)
        try:
            return propertiesByFamily[family]
        except KeyError:
            raise DocumentConversionException("unsupported conversion: from '%s' to '%s'" % (family, outputExt))

    def _detectFamily(self, document):
        """Return the FAMILY_* constant matching the services of ``document``.

        Raises:
            DocumentConversionException: if none of the known services match.
        """
        # WebDocument is tested before GenericTextDocument so that a document
        # supporting both is classified as FAMILY_WEB.
        if document.supportsService("com.sun.star.text.WebDocument"):
            return FAMILY_WEB
        if document.supportsService("com.sun.star.text.GenericTextDocument"):
            # must be TextDocument or GlobalDocument
            return FAMILY_TEXT
        if document.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
            return FAMILY_SPREADSHEET
        if document.supportsService("com.sun.star.presentation.PresentationDocument"):
            return FAMILY_PRESENTATION
        if document.supportsService("com.sun.star.drawing.DrawingDocument"):
            return FAMILY_DRAWING
        raise DocumentConversionException("unknown document family: %s" % document)

    def _getFileExt(self, path):
        """Return the lower-cased extension of ``path`` without the dot.

        Returns an empty string when ``path`` has no extension.
        """
        # splitext() always returns a string for the extension, so no None
        # check is needed; slicing off the dot is safe on "" as well.
        return splitext(path)[1][1:].lower()

    def _toFileUrl(self, path):
        """Return the file URL for the absolute form of ``path``."""
        return uno.systemPathToFileUrl(abspath(path))

    def _toProperties(self, dict):
        """Return a tuple of PropertyValue objects, one per item of ``dict``."""
        # The parameter name shadows the builtin; it is kept so existing
        # callers are unaffected.
        props = []
        for key, value in dict.items():
            prop = PropertyValue()
            prop.Name = key
            prop.Value = value
            props.append(prop)
        return tuple(props)


if __name__ == "__main__":
    from sys import argv, exit

    # Command-line entry point: python <script> <input-file> <output-file>
    if len(argv) < 3:
        print("USAGE: python %s <input-file> <output-file>" % argv[0])
        exit(255)

    sourcePath, targetPath = argv[1], argv[2]
    if not isfile(sourcePath):
        print("no such input file: %s" % sourcePath)
        exit(1)

    # Both failure kinds are reported on stdout and end with exit status 1.
    try:
        DocumentConverter().convert(sourcePath, targetPath)
    except DocumentConversionException as conversionError:
        print("ERROR! " + str(conversionError))
        exit(1)
    except ErrorCodeIOException as ioError:
        print("ERROR! ErrorCodeIOException %d" % ioError.ErrCode)
        exit(1)

编写完上面的脚本之后,要把它放到openOffice的安装目录下(如下图所示):
在这里插入图片描述
因为如果不放openOffice的目录下很多类库都引用不到,会导致程序执行不了。
最后万事俱备,只欠东风了。
打开cmd命令行,输入这个命令
在这里插入图片描述
可以把doc转为pdf,pdf文件已经生成了
在这里插入图片描述
输入这个命令
在这里插入图片描述
可以把doc转为html,html文件已经生成了
在这里插入图片描述
其他的office文件转pdf和html,大家可以按需自行尝试。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

reg183

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值