Kunpeng 的 aarch64 架构 CPU、openEuler 系统、昇腾服务器适配文档转换功能（doc 转 docx、ppt 转 pptx）

内卷焦虑人士

于 2024-06-13 15:03:23 发布

阅读量350

点赞数 5

文章标签：服务器 powerpoint 人工智能

本文链接：https://blog.csdn.net/weixin_46398647/article/details/139654349

版权

一、安装flatpak

sudo yum install flatpak  
flatpak remote-add --if-not-exists flathub https://flathub.org/repo/flathub.flatpakrepo

二、安装libreoffice

flatpak install flathub org.libreoffice.LibreOffice

三、使用

对于使用 flatpak 安装的 LibreOffice，不需要手动启动或设置任何环境变量。flatpak 提供了一个沙箱化的运行环境，确保应用程序可以正常运行。
flatpak 应用程序的可执行文件通常位于类似下面的路径：

/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice

只要在代码中正确指定了这个完整路径，就可以直接运行和调用 LibreOffice，而无需进行任何其他设置。

四、示例代码-doc

import tqdm
import subprocess
import os
from dotenv import load_dotenv
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.text.paragraph import Paragraph
from docx import Document
import re

# Load entries from a .env file (if any) into the process environment so the
# lookup below can see them.
load_dotenv()
# Launcher used for headless conversions. The LIBREOFFICE_PATH environment
# variable overrides the default, which is the flatpak-exported LibreOffice
# launcher for the aarch64 build.
libreoffice_path = os.getenv(
    "LIBREOFFICE_PATH",
    "/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice",
)
def convert_doc_to_docx(doc_file_path):
    """Return the path of a .docx version of ``doc_file_path``.

    A ``.docx`` input is returned unchanged. A ``.doc`` input is converted by
    running LibreOffice in headless mode, with the output written to the
    directory that holds the input. Extensions are matched case-insensitively.

    Args:
        doc_file_path: Path of the ``.doc`` or ``.docx`` file.

    Returns:
        The path of the ``.docx`` file, or ``False`` when the extension is
        neither ``.doc`` nor ``.docx``.

    Raises:
        RuntimeError: If the LibreOffice process exits with a non-zero code.
    """
    # splitext looks only at the final suffix, so names such as
    # "report.v2.doc" or files without any extension are classified correctly.
    root, ext = os.path.splitext(doc_file_path)
    ext = ext.lower()

    if ext == ".docx":
        return doc_file_path
    if ext != ".doc":
        return False

    # Define the command to run LibreOffice in headless mode.
    command = [
        libreoffice_path,
        '--headless',
        '--convert-to', 'docx',
        # A bare file name has an empty dirname; fall back to the current directory.
        '--outdir', os.path.dirname(doc_file_path) or '.',
        doc_file_path
    ]

    # Run the command
    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode != 0:
        raise RuntimeError(f"Failed to convert '{doc_file_path}' to DOCX.\nError: {result.stderr}")

    # Swap only the trailing extension; a ".doc" substring elsewhere in the
    # path (for example in a directory name) must stay untouched.
    return root + ".docx"

def doc2text(filepath):
    """Extract the text of a .doc/.docx file, walking paragraphs and tables in body order.

    The file is first passed through ``convert_doc_to_docx``. Every top-level
    paragraph and every paragraph inside the cells of top-level tables
    contributes one stripped line. Consecutive newlines are then collapsed to
    one, and any run of five or more identical characters is reduced to a
    single character.

    Args:
        filepath: Path of the ``.doc`` or ``.docx`` file.

    Returns:
        A dict with keys ``'document'`` (the extracted text), ``'metadata'``
        (the path of the .docx file that was read) and ``'format'``
        (always ``"docx_text"``).

    Raises:
        ValueError: If ``convert_doc_to_docx`` reports an unsupported file type.
    """
    docx_path = convert_doc_to_docx(filepath)
    if not docx_path:
        # The converter signals an unsupported extension with False; fail here
        # with a clear message instead of handing False to Document().
        raise ValueError(f"Unsupported file type, expected .doc or .docx: '{filepath}'")
    doc = Document(docx_path)

    def iter_block_items(parent):
        """Yield the Paragraph and Table children of ``parent`` in their stored order."""
        # The module-level ``Document`` name is what opens the file above; the
        # class needed for isinstance lives in docx.document, so import it
        # under its own name to avoid shadowing.
        from docx.document import Document as DocumentClass
        if isinstance(parent, DocumentClass):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("parse fail")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    # Collect lines in a list and join once rather than growing a string.
    lines = []
    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            lines.append(block.text.strip())
        elif isinstance(block, Table):
            lines.extend(
                paragraph.text.strip()
                for row in block.rows
                for cell in row.cells
                for paragraph in cell.paragraphs
            )
    resp = "".join(line + "\n" for line in lines)

    # Collapse blank lines left behind by empty paragraphs.
    resp = re.sub(r'\n+', '\n', resp)
    # Reduce runs of 5+ identical characters to a single one.
    resp = re.sub(r'(.)\1{4,}', r'\1', resp)
    return {'document': resp, 'metadata': docx_path, 'format': "docx_text"}

if __name__ == '__main__':
    import json

    # Example run: extract one document and store the result as JSON in the
    # same directory as the input.
    file_path="/opt/rag/data/xxx.doc"
    loader = doc2text(file_path)

    # Output name is "<text before the first dot>_docx.json".
    source_dir, source_name = os.path.split(file_path)
    stem = source_name.split(".")[0]
    output_path = os.path.join(source_dir, stem + "_docx" + ".json")

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(loader, out_file, ensure_ascii=False, indent=4)

五、示例代码-ppt

import os
import subprocess
from dotenv import load_dotenv
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import CompositeElement, Table
from unstructured.partition.pptx import partition_pptx
import json

# Load entries from a .env file (if any) into the process environment so the
# lookup below can see them.
load_dotenv()
# Launcher used for headless conversions. The LIBREOFFICE_PATH environment
# variable overrides the default, which is the flatpak-exported LibreOffice
# launcher for the aarch64 build.
libreoffice_path = os.getenv(
    "LIBREOFFICE_PATH",
    "/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice",
)

def remove_duplicates(lst):
    """Return a new list with repeated items dropped, keeping first-seen order.

    Items are used as dict keys, so they must be hashable.
    """
    # dict keys are unique and remember insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(lst))

def ppt2text(file_name: str):
    """Extract title-based text chunks from a .ppt/.pptx presentation.

    The file is first passed through ``convert_ppt_to_pptx``, then partitioned
    with ``partition_pptx`` and grouped with ``chunk_by_title``. The text of
    each ``CompositeElement`` chunk becomes one entry. The HTML of a ``Table``
    chunk is appended to the preceding entry, or becomes the first entry when
    nothing precedes it. Duplicate entries are removed, keeping the first.

    Args:
        file_name: Path of the ``.ppt`` or ``.pptx`` file.

    Returns:
        A dict with keys ``'document'`` (list of text entries), ``'metadata'``
        (the path of the .pptx file that was read) and ``'format'``
        (always ``"pptx_text"``).

    Raises:
        ValueError: If ``convert_ppt_to_pptx`` reports an unsupported file type.
    """
    pptx_path = convert_ppt_to_pptx(file_name)
    if not pptx_path:
        # The converter signals an unsupported extension with False; fail here
        # with a clear message instead of handing False to partition_pptx.
        raise ValueError(f"Unsupported file type, expected .ppt or .pptx: '{file_name}'")

    elements = partition_pptx(
        filename=pptx_path,
        multipage_sections=True,
        infer_table_structure=True,
        include_page_breaks=False,
    )

    chunks = chunk_by_title(
        elements=elements,
        multipage_sections=True,
        combine_text_under_n_chars=0,
        new_after_n_chars=None,
        max_characters=4096,
    )

    text_list = []
    for chunk in chunks:
        if isinstance(chunk, CompositeElement):
            text_list.append(chunk.text)
        elif isinstance(chunk, Table):
            # NOTE(review): text_as_html is presumably optional metadata; fall
            # back to the plain text so a missing value cannot break the
            # string concatenation below - confirm against unstructured docs.
            table_html = chunk.metadata.text_as_html or chunk.text
            if text_list:
                # Keep the table with the text chunk that precedes it.
                text_list[-1] = text_list[-1] + "\n" + table_html
            else:
                # The original read ``chunk.hunk.metadata`` here, which raised
                # AttributeError whenever a table was the first chunk.
                text_list.append(table_html)

    return {
        'document': remove_duplicates(text_list),
        'metadata': pptx_path,
        'format': "pptx_text",
    }

def convert_ppt_to_pptx(ppt_file_path):
    """Return the path of a .pptx version of ``ppt_file_path``.

    A ``.pptx`` input is returned unchanged. A ``.ppt`` input is converted by
    running LibreOffice in headless mode, with the output written to the
    directory that holds the input. Extensions are matched case-insensitively.

    Args:
        ppt_file_path: Path of the ``.ppt`` or ``.pptx`` file.

    Returns:
        The path of the ``.pptx`` file, or ``False`` when the extension is
        neither ``.ppt`` nor ``.pptx``.

    Raises:
        RuntimeError: If the LibreOffice process exits with a non-zero code.
    """
    # splitext looks only at the final suffix, so names such as
    # "deck.final.ppt" or files without any extension are classified correctly.
    root, ext = os.path.splitext(ppt_file_path)
    ext = ext.lower()

    if ext == ".pptx":
        return ppt_file_path
    if ext != ".ppt":
        return False

    # Define the command to run LibreOffice in headless mode.
    command = [
        libreoffice_path,
        '--headless',
        '--convert-to', 'pptx',
        # A bare file name has an empty dirname; fall back to the current directory.
        '--outdir', os.path.dirname(ppt_file_path) or '.',
        ppt_file_path
    ]

    # Run the command
    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode != 0:
        raise RuntimeError(f"Failed to convert '{ppt_file_path}' to PPTX.\nError: {result.stderr}")

    # Swap only the trailing extension; a ".ppt" substring elsewhere in the
    # path (for example in a directory name) must stay untouched.
    return root + ".pptx"

if __name__ == "__main__":
    # Example run: extract one presentation and store the result as JSON in
    # the same directory as the input.
    pptx_file_path = "/opt/data/xxx.ppt"
    contents = ppt2text(pptx_file_path)

    # Output name is "<text before the first dot>_ppt.json".
    source_dir, source_name = os.path.split(pptx_file_path)
    stem = source_name.split(".")[0]
    output_path = os.path.join(source_dir, stem + "_ppt" + ".json")

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(contents, out_file, ensure_ascii=False, indent=4)