1.安装libreoffice
sudo apt-get install libreoffice
sudo apt-get install libreoffice-l10n-zh-cn libreoffice-help-zh-cn
2.转换
libreoffice --headless --convert-to pdf 123.doc # doc转pdf
libreoffice --headless --convert-to pdf 1234.pptx # pptx转pdf
libreoffice --headless --convert-to pdf 1234.ppt # ppt转pdf
libreoffice --headless --convert-to docx 123.doc # doc转docx
libreoffice --headless --convert-to pptx 123.ppt # ppt转pptx
3.提取docx的文字
3.1 安装包
pip install python-docx
3.2 python提取
from docx import Document

# Path of the .docx file to read.
docx_file = r"./data/123.docx"

# Print the text of each paragraph, one per line.
for para in Document(docx_file).paragraphs:
    print(para.text)
4.pdf转png
4.1 安装包
pip install PyMuPDF # 提供 import fitz;不要执行 pip install fitz(PyPI 上的 fitz 是另一个无关的包,会与 PyMuPDF 冲突)
4.2 pdf转png
# -*- coding: utf-8 -*-
import os
import fitz
# Save every page of a PDF as a PNG image.
def pdf_to_image(pdf_path, save_path=None, zoom_x=5, zoom_y=5, theta=0):
    """Render each page of a PDF to ``<page index>.png`` inside ``save_path``.

    Args:
        pdf_path: Path of the PDF file to render.
        save_path: Output directory. It is created when missing; when
            ``None`` the current working directory is used.
        zoom_x: Horizontal scale factor passed to ``fitz.Matrix``.
        zoom_y: Vertical scale factor passed to ``fitz.Matrix``.
        theta: Rotation passed to ``Matrix.prerotate``.
    """
    if save_path is None:
        # os.path.join(None, ...) would raise TypeError, so pick a usable default.
        save_path = os.curdir

    # The scale/rotation transform is identical for all pages: build it once.
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(theta)

    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as pdf:
        os.makedirs(save_path, exist_ok=True)
        # page_count is the snake_case name; the legacy camelCase pageCount
        # is no longer available in newer PyMuPDF releases.
        for pg in range(pdf.page_count):
            pm = pdf[pg].get_pixmap(matrix=trans, alpha=False)
            # Files are named after the zero-based page index.
            pm.save(os.path.join(save_path, "{}.png".format(pg)))
if __name__ == "__main__":
    # Render the sample PDF into PNG files under the sample output folder.
    pdf_to_image(r"./data/1234.pdf", r"./data/save/pdf")
5.pdf转txt
5.1 安装
pip install pdfplumber
5.2 使用
import pdfplumber

# Print the text extracted from each page of the PDF, in page order.
with pdfplumber.open("123.pdf") as doc:
    for pg in doc.pages:
        page_text = pg.extract_text()
        print(page_text)
6.pptx转pdf时汉字显示乱码(系统缺少中文字体)
准备字体文件 msyh.ttf(微软雅黑),然后安装到系统:
cp msyh.ttf /usr/share/fonts/ # 微软雅黑 -> 系统
cd /usr/share/fonts/
fc-cache -fv # 刷新字体缓存
fc-list :lang=zh # 查看是否安装成功
/usr/share/fonts/MSYH.TTF: Microsoft YaHei:style=Regular,Normal