使用python脚本完成word转pdf(兼容linux)
参考:https://v3u.cn/a_id_96
起因:看到一个需求是用java把word转成pdf,在windows上使用Jacob可以实现,但linux上比较麻烦, 性能等综合考虑使用OpenOffice比较好。
感觉可以用java调用python脚本实现,这里做个记录。
在原博客中,作者在windows环境下使用了comtypes实现的转换,我本地换成了pywin32实现,另,增加了一个输出目录的参数,用于指定生成pdf的路径。
环境:本地:win10 + jdk1.8 + python3.7 linux服务器:centos7 + jdk1.8 + python3.6
jdk和python3的安装可百度,这里不做记录。
具体实现:
1)原博客需要在windows环境安装comtypes;本文脚本改用pywin32(见下一步),并通过能否成功导入win32com来判断是windows环境还是linux环境
2)windows环境安装pywin32库:pip install pywin32
3)linux中需要使用LibreOffice,安装依赖:
yum remove libreoffice-*
从https://www.libreoffice.org/download/download/上下载最新版本的linux rpm版本的LibreOffice,上传到自己的linux环境中,我这里下载的是LibreOffice_6.2.5_Linux_x86-64_rpm.tar.gz
解压:tar -zxvf LibreOffice_6.2.5_Linux_x86-64_rpm.tar.gz
cd LibreOffice_6.2.5.2_Linux_x86-64_rpm/RPMS
yum localinstall *.rpm
4)安装其他依赖:
yum install cairo cups-libs libSM
yum install ibus
yum install libreoffice-headless
5)查看是否安装成功:libreoffice -help
6)将windows环境字体拷贝至linux中
将windows的 C:\Windows\Fonts 文件夹中的所有内容拷贝至linux的 /usr/share/fonts/chinese 文件夹下,该文件夹不存在就新建
7)修改字体缓存、权限
chmod -R 755 /usr/share/fonts/chinese
fc-cache -fv
fc-list | grep chinese //查看安装的新字体
8)linux下测试Libreoffice是否可用
libreoffice6.2 --headless --convert-to pdf /root/xxx.docx ------直接生成在了docx文件同目录下,有同名的文件会覆盖掉
libreoffice6.2 --headless --convert-to pdf /root/xxx.docx --outdir /root -------指定了输出路径
一切都OK的话,就可以执行脚本了:
# -*- coding: utf-8 -*-
# @Time : 2020/5/29 20:27
# @Author : LuckyKid
# @Site :
# @File : txt2pdf.py
# @Software: PyCharm
# -*- encoding: utf-8 -*-
import os, fitz
import subprocess
from docxtpl import DocxTemplate
import time
# pip install PyMuPDF
try:
from win32com import client
except ImportError:
client = None
# Word template that is rendered for every report; a relative path, so it is
# resolved against the process working directory.
demo_file_name = "./估计证明书_RedBook.docx"
# Directory for generated files: "<template directory>/data/". The trailing
# separator is kept on purpose because callers append bare file names to it.
current_file_path = os.sep.join([os.path.dirname(os.path.abspath(demo_file_name)), 'data', ''])
def pyMuPDF_fitz(pdfPath, imagePath):
    """Render the first page of a PDF file to a PNG image.

    :param pdfPath: path of the PDF file to read
    :param imagePath: path of the PNG file to write
    """
    print("imagePath=" + imagePath)
    pdfDoc = fitz.open(pdfPath)
    try:
        page = pdfDoc[0]  # only the first page is rendered
        rotate = 0
        # A zoom factor of 2 on each axis doubles the rendered width and height.
        # Per the original author's note, the un-zoomed page renders at 792x612.
        zoom_x = 2  # (1.33333333-->1056x816) (2-->1584x1224)
        zoom_y = 2
        # NOTE(review): preRotate / getPixmap / writePNG are the camelCase names
        # of older PyMuPDF releases; newer releases use snake_case names
        # (prerotate / get_pixmap / save) -- confirm against the installed version.
        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
        pix = page.getPixmap(matrix=mat, alpha=False)
        pix.writePNG(imagePath)  # write the image to the requested location
    finally:
        # Close explicitly so the PDF is released even when rendering fails;
        # the caller deletes the PDF right after this function returns.
        pdfDoc.close()
def doc2pdf_linux(docPath):
    """Convert a Word document to PDF with headless LibreOffice.

    The PDF is written into ``current_file_path`` (passed as ``--outdir``).

    :param docPath: path of the Word document to convert
    :return: path at which the PDF is expected, i.e. ``current_file_path``
        plus the document's base name with a ``.pdf`` extension
    :raises subprocess.TimeoutExpired: if the conversion takes longer than 30 s
    :raises subprocess.SubprocessError: if LibreOffice writes to stderr or
        exits with a non-zero status
    """
    cmd = ['libreoffice6.2', '--headless', '--convert-to', 'pdf',
           docPath, '--outdir', current_file_path]
    # subprocess.run() drains both pipes while it waits (a bare wait() on a
    # process with PIPE streams can deadlock once a pipe buffer fills) and
    # kills the child if the timeout expires.
    completed = subprocess.run(cmd, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, timeout=30)
    if completed.stderr:
        raise subprocess.SubprocessError(completed.stderr)
    if completed.returncode != 0:
        raise subprocess.SubprocessError(
            "libreoffice exited with code %d: %r"
            % (completed.returncode, completed.stdout))
    pdf_basename = os.path.splitext(os.path.basename(docPath))[0] + '.pdf'
    return os.path.join(current_file_path, pdf_basename)
def doc2pdf(doc_name, pdf_name):
    """Convert a Word document to PDF.

    Uses Word through COM when ``win32com`` could be imported, and falls back
    to ``doc2pdf_linux`` (LibreOffice) otherwise.

    :param doc_name: path of the Word document to convert
    :param pdf_name: path of the PDF file to produce
    :return: ``pdf_name``
    """
    if not client:
        doc2pdf_linux(doc_name)
        # LibreOffice names the PDF after the document and writes it into
        # current_file_path; move it when the caller asked for another path.
        produced = os.path.join(
            current_file_path,
            os.path.splitext(os.path.basename(doc_name))[0] + '.pdf')
        if os.path.abspath(produced) != os.path.abspath(pdf_name):
            os.replace(produced, pdf_name)
        return pdf_name

    # NOTE(review): absolute paths are used because Word presumably resolves
    # relative paths against its own working directory -- confirm.
    doc_name = os.path.abspath(doc_name)
    pdf_name = os.path.abspath(pdf_name)

    word = client.DispatchEx("Word.Application")
    try:
        if os.path.exists(pdf_name):
            os.remove(pdf_name)
        worddoc = word.Documents.Open(doc_name, ReadOnly=1)
        try:
            worddoc.SaveAs(pdf_name, FileFormat=17)  # 17 = wdFormatPDF
        finally:
            worddoc.Close()
    finally:
        # DispatchEx starts a separate Word instance; quit it so a Word
        # process is not left running after every conversion.
        word.Quit()
    return pdf_name
def data_2_world_by_demo(data_dic, save_file_name):
    """Render the Word template with the given data and save the result.

    :param data_dic: mapping of template placeholders to their values
    :param save_file_name: path of the Word document to write
    """
    template = DocxTemplate(demo_file_name)  # load the template file
    template.render(data_dic)  # substitute the placeholders
    template.save(save_file_name)  # write the filled-in document
def data_2_png(description='', yearGroup='', vehicle_key='', reg_date='', msrp='', emission='', vin='', province='',
               maintenance='', age='', mileage='', engine='', plate_number='', city='', insurance='', stand_rv='',
               damage_1='', damage_2='', damage_3='', damage_rv='', option_1='', option_2='', option_3='', option_rv='',
               wholesale_rv='', retail_rv='', id=''):
    """Render the report template with the given values and return a PNG of it.

    The values are filled into the Word template, the document is converted to
    PDF, and the first PDF page is rendered to PNG. All files are named after
    ``id`` and live in ``current_file_path``; the intermediate Word and PDF
    files are removed before returning.

    :param id: base name (without extension) of the generated files
    :return: path of the generated PNG file

    Every other parameter is passed to the template under its own name; the
    template additionally receives ``date`` set to ``time.asctime()``.
    """
    data_dict = {
        'date': time.asctime(),
        'description': description,
        'yearGroup': yearGroup,
        'vehicle_key': vehicle_key,
        'reg_date': reg_date,
        'msrp': msrp,
        'emission': emission,
        'vin': vin,
        'province': province,
        'maintenance': maintenance,
        'age': age,
        'mileage': mileage,
        'engine': engine,
        'plate_number': plate_number,
        'city': city,
        'insurance': insurance,
        'stand_rv': stand_rv,
        'damage_1': damage_1,
        'damage_2': damage_2,
        'damage_3': damage_3,
        'damage_rv': damage_rv,
        'option_1': option_1,
        'option_2': option_2,
        'option_3': option_3,
        'option_rv': option_rv,
        'wholesale_rv': wholesale_rv,
        'retail_rv': retail_rv,
    }

    # The output directory is not created anywhere else; make sure it exists
    # before the first file is saved into it.
    os.makedirs(current_file_path, exist_ok=True)

    # current_file_path already ends with a path separator.
    save_file_name = current_file_path + id + ".docx"
    save_pdf_name = current_file_path + id + ".pdf"
    save_png_name = current_file_path + id + ".png"

    try:
        # template + data -> Word document
        data_2_world_by_demo(data_dict, save_file_name)
        # Word document -> PDF
        doc2pdf(save_file_name, save_pdf_name)
        # PDF -> PNG
        pyMuPDF_fitz(save_pdf_name, save_png_name)
    finally:
        # Remove the intermediate files even when a step failed; a file that
        # was never produced is skipped so the original error is not masked.
        for leftover in (save_file_name, save_pdf_name):
            if os.path.exists(leftover):
                os.remove(leftover)
    return save_png_name
# 奔驰(进口) GLE 350 2020款 旅行车 5门 4MATIC 豪华型(特殊配置) 7座 手自一体 9速 四轮驱动 2.0直接喷射涡轮增压 (国 VI)
if __name__ == '__main__':
    # Sample valuation record used to exercise the whole pipeline by hand;
    # each key is the data_2_png parameter of the same name.
    sample_record = {
        'description': '2017款 旅行车 5门 xDrive35i 豪华型 手自一体 8速 四轮驱动 3.0直接喷射涡轮增压22222222222222222222222222',
        'yearGroup': 'MY2020/MY2020',
        'vehicle_key': 'BMW 18AA',
        'reg_date': '2020/4/27',
        'msrp': '157,3000',
        'emission': 'GB VI',
        'vin': 'SALGA3BU9LA5XXXX',
        'province': '广东',
        'maintenance': '是',
        'age': '2年',
        'mileage': '20,000公里',
        'engine': '2.0T',
        'plate_number': '粤AD123456',
        'city': '广州',
        'insurance': '无',
        'stand_rv': '950,000',
        'damage_1': '前车门补漆',
        'damage_2': '换发动机',
        'damage_3': '水泡',
        'damage_rv': '21,000',
        'option_1': '木纹内饰',
        'option_2': '电动加热',
        'option_3': 'GPS导航',
        'option_rv': '30,000',
        'wholesale_rv': '959,900',
        'retail_rv': '1,029,900',
        'id': '202008511213-128-BMW 19AB',
    }
    data_2_png(**sample_record)