一、安装flatpak
sudo yum install flatpak
flatpak remote-add --if-not-exists flathub https://flathub.org/repo/flathub.flatpakrepo
二、安装libreoffice
flatpak install flathub org.libreoffice.LibreOffice
三、使用
对于使用 flatpak 安装的 LibreOffice,不需要手动启动或设置任何环境变量。flatpak 提供了一个沙箱化的运行环境,确保应用程序可以正常运行。
flatpak 应用程序的可执行文件通常位于类似如下的路径(其中的架构目录 aarch64 需按实际机器的架构调整,例如 x86_64):
/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice
只要在代码中正确指定了这个完整路径,就可以直接运行和调用 LibreOffice,而无需进行任何其他设置。
四、示例代码-doc
import tqdm
import subprocess
import os
from dotenv import load_dotenv
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.text.paragraph import Paragraph
from docx import Document
import re
# Load settings from a .env file (python-dotenv).
# NOTE(review): nothing in the visible code reads an environment variable afterwards.
load_dotenv()
# Absolute path of the flatpak-exported LibreOffice launcher used for conversions.
# The path contains an architecture component ("aarch64"); presumably it differs
# on other architectures -- confirm against the local flatpak installation.
libreoffice_path = "/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice"
def convert_doc_to_docx(doc_file_path):
    """Return the path of a DOCX version of ``doc_file_path``.

    ``.docx`` inputs are returned unchanged. ``.doc`` inputs are converted by
    running LibreOffice (``libreoffice_path``) in headless mode; the converted
    file is written into the same directory as the source file.

    Args:
        doc_file_path: Path to a ``.doc`` or ``.docx`` file.

    Returns:
        The path of the DOCX file, or ``False`` when the file extension is
        neither ``.doc`` nor ``.docx`` (this includes names with no extension).

    Raises:
        RuntimeError: If LibreOffice exits with a non-zero status, or exits
            successfully without producing the expected output file.
    """
    # Only the final suffix decides the file type, so names such as
    # "report.v2.docx" are classified correctly and extension-less names no
    # longer raise IndexError.
    stem, extension = os.path.splitext(doc_file_path)
    extension = extension.lower()
    if extension == ".docx":
        return doc_file_path
    if extension != ".doc":
        return False

    # A bare file name has an empty dirname; fall back to the current directory
    # so that --outdir never receives an empty argument.
    output_dir = os.path.dirname(doc_file_path) or "."
    command = [
        libreoffice_path,
        '--headless',
        '--convert-to', 'docx',
        '--outdir', output_dir,
        doc_file_path,
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Failed to convert '{doc_file_path}' to DOCX.\nError: {result.stderr}")

    # Swap only the trailing suffix; str.replace would also rewrite any ".doc"
    # that happens to appear in a directory name.
    docx_file_path = stem + ".docx"
    if not os.path.isfile(docx_file_path):
        raise RuntimeError(
            f"Failed to convert '{doc_file_path}' to DOCX: "
            f"expected output '{docx_file_path}' was not created.\nError: {result.stderr}"
        )
    return docx_file_path
def doc2text(filepath):
    """Extract the text of a Word document, including text inside tables.

    The file is first passed through ``convert_doc_to_docx`` so that ``.doc``
    inputs are converted to DOCX. Top-level paragraphs and tables are then
    visited in the order they appear in the document body.

    Args:
        filepath: Path to a ``.doc`` or ``.docx`` file.

    Returns:
        A dict with three keys: ``'document'`` (the extracted text, one stripped
        paragraph per line, with runs of newlines collapsed and any character
        repeated five or more times reduced to a single occurrence),
        ``'metadata'`` (the path of the DOCX file that was read) and
        ``'format'`` (always ``"docx_text"``).

    Raises:
        ValueError: If the file extension is not supported by
            ``convert_doc_to_docx``.
    """
    docx_path = convert_doc_to_docx(filepath)
    if not docx_path:
        # convert_doc_to_docx signals an unsupported extension with False;
        # fail here with a clear message instead of handing False to Document().
        raise ValueError(f"Unsupported file type (expected .doc or .docx): {filepath}")
    doc = Document(docx_path)

    def iter_block_items(parent):
        """Yield each paragraph and table that is a direct child of ``parent``."""
        # ``docx.Document`` is a factory function, so the class needed for the
        # isinstance check is imported separately under a distinct name.
        from docx.document import Document as DocumentClass
        if isinstance(parent, DocumentClass):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("parse fail")
        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    # Collect lines in a list and join once rather than growing a string.
    lines = []
    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            lines.append(block.text.strip())
        elif isinstance(block, Table):
            for row in block.rows:
                for cell in row.cells:
                    lines.extend(paragraph.text.strip() for paragraph in cell.paragraphs)

    resp = "".join(line + "\n" for line in lines)
    # Collapse blank lines left behind by empty paragraphs.
    resp = re.sub(r'\n+', '\n', resp)
    # Reduce any character repeated five or more times in a row to one copy.
    resp = re.sub(r'(.)\1{4,}', r'\1', resp)
    return {'document': resp, 'metadata': docx_path, 'format': "docx_text"}
if __name__ == '__main__':
    import json

    # Example run: extract one document and store the result as JSON next to it.
    source_path = "/opt/rag/data/xxx.doc"
    source_dir = os.path.dirname(source_path)
    # Text before the first dot of the file name becomes the output name stem.
    name_stem = os.path.basename(source_path).split(".")[0]
    extracted = doc2text(source_path)
    json_path = os.path.join(source_dir, name_stem + "_docx" + ".json")
    with open(json_path, 'w', encoding='utf-8') as out_file:
        json.dump(extracted, out_file, ensure_ascii=False, indent=4)
五、示例代码-ppt
import os
import subprocess
from dotenv import load_dotenv
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import CompositeElement, Table
from unstructured.partition.pptx import partition_pptx
import json
# Load settings from a .env file (python-dotenv).
# NOTE(review): nothing in the visible code reads an environment variable afterwards.
load_dotenv()
# Absolute path of the flatpak-exported LibreOffice launcher used for conversions.
# The path contains an architecture component ("aarch64"); presumably it differs
# on other architectures -- confirm against the local flatpak installation.
libreoffice_path = "/var/lib/flatpak/app/org.libreoffice.LibreOffice/aarch64/stable/active/export/bin/org.libreoffice.LibreOffice"
def remove_duplicates(lst):
    """Return a new list with the items of ``lst``, keeping only first occurrences.

    The original order is preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order, so the first occurrence
    # of every item survives in its original position.
    return list(dict.fromkeys(lst))
def ppt2text(file_name: str):
    """Extract the text of a PowerPoint file as a list of chunks.

    The file is first passed through ``convert_ppt_to_pptx`` so that ``.ppt``
    inputs are converted to PPTX. The presentation is partitioned with
    ``partition_pptx`` and grouped with ``chunk_by_title``. The HTML of a table
    chunk is appended to the preceding text chunk when there is one.

    Args:
        file_name: Path to a ``.ppt`` or ``.pptx`` file.

    Returns:
        A dict with three keys: ``'document'`` (list of chunk strings with
        duplicates removed, first occurrence kept), ``'metadata'`` (the path of
        the PPTX file that was read) and ``'format'`` (always ``"pptx_text"``).

    Raises:
        ValueError: If the file extension is not supported by
            ``convert_ppt_to_pptx``.
    """
    pptx_path = convert_ppt_to_pptx(file_name)
    if not pptx_path:
        # convert_ppt_to_pptx signals an unsupported extension with False;
        # fail here with a clear message instead of handing False to the parser.
        raise ValueError(f"Unsupported file type (expected .ppt or .pptx): {file_name}")

    elements = partition_pptx(
        filename=pptx_path,
        multipage_sections=True,
        infer_table_structure=True,
        include_page_breaks=False,
    )
    chunks = chunk_by_title(
        elements=elements,
        multipage_sections=True,
        combine_text_under_n_chars=0,
        new_after_n_chars=None,
        max_characters=4096,
    )

    text_list = []
    for chunk in chunks:
        if isinstance(chunk, CompositeElement):
            text_list.append(chunk.text)
        elif isinstance(chunk, Table):
            # NOTE(review): text_as_html is presumably always set because
            # infer_table_structure=True; fall back to the plain text so a
            # missing value cannot break the string concatenation below.
            table_html = chunk.metadata.text_as_html or chunk.text
            if text_list:
                # Attach the table to the text chunk that precedes it.
                text_list[-1] = text_list[-1] + "\n" + table_html
            else:
                # A table that comes first becomes its own entry. The previous
                # code read ``chunk.hunk.metadata`` here, which raised
                # AttributeError.
                text_list.append(table_html)

    return {
        'document': remove_duplicates(text_list),
        'metadata': pptx_path,
        'format': "pptx_text",
    }
def convert_ppt_to_pptx(ppt_file_path):
    """Return the path of a PPTX version of ``ppt_file_path``.

    ``.pptx`` inputs are returned unchanged. ``.ppt`` inputs are converted by
    running LibreOffice (``libreoffice_path``) in headless mode; the converted
    file is written into the same directory as the source file.

    Args:
        ppt_file_path: Path to a ``.ppt`` or ``.pptx`` file.

    Returns:
        The path of the PPTX file, or ``False`` when the file extension is
        neither ``.ppt`` nor ``.pptx`` (this includes names with no extension).

    Raises:
        RuntimeError: If LibreOffice exits with a non-zero status, or exits
            successfully without producing the expected output file.
    """
    # Only the final suffix decides the file type, so names such as
    # "deck.final.pptx" are classified correctly and extension-less names no
    # longer raise IndexError.
    stem, extension = os.path.splitext(ppt_file_path)
    extension = extension.lower()
    if extension == ".pptx":
        return ppt_file_path
    if extension != ".ppt":
        return False

    # A bare file name has an empty dirname; fall back to the current directory
    # so that --outdir never receives an empty argument.
    output_dir = os.path.dirname(ppt_file_path) or "."
    command = [
        libreoffice_path,
        '--headless',
        '--convert-to', 'pptx',
        '--outdir', output_dir,
        ppt_file_path,
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"Failed to convert '{ppt_file_path}' to PPTX.\nError: {result.stderr}")

    # Swap only the trailing suffix; str.replace would also rewrite any ".ppt"
    # that happens to appear in a directory name.
    pptx_file_path = stem + ".pptx"
    if not os.path.isfile(pptx_file_path):
        raise RuntimeError(
            f"Failed to convert '{ppt_file_path}' to PPTX: "
            f"expected output '{pptx_file_path}' was not created.\nError: {result.stderr}"
        )
    return pptx_file_path
if __name__ == "__main__":
    # Example run: extract one presentation and store the result as JSON next to it.
    pptx_file_path = "/opt/data/xxx.ppt"
    target_dir = os.path.dirname(pptx_file_path)
    # Text before the first dot of the file name becomes the output name stem.
    name_stem = os.path.basename(pptx_file_path).split(".")[0]
    contents = ppt2text(pptx_file_path)
    json_path = os.path.join(target_dir, name_stem + "_ppt" + ".json")
    with open(json_path, 'w', encoding='utf-8') as out_file:
        json.dump(contents, out_file, ensure_ascii=False, indent=4)