python使用 wkhtmltopdf 和 pdfkit 批量加载html生成pdf,适用于博客备份和官网文档打包
wkhtmltopdf 是一个非常好的工具,可在多个平台上将 html 转换为 pdf;pdfkit 是 wkhtmltopdf 的 Python 封装包
1.安装 wkhtmltopdf
1.1 windows 安装
https://wkhtmltopdf.org/downloads.html
下载版本 Windows (MinGW) 0.12.4 32-bit / 64-bit for Windows XP/2003 or later; standalone
将安装目录添加到系统环境变量 PATH:D:\Program Files\wkhtmltopdf\bin
添加后需要重新打开 cmd 以及 notepad++,新的环境变量才会生效
1.2 centos 7 安装
https://download.csdn.net/download/qq_16089135/88488817
执行以下命令
yum install xorg-x11-fonts-75dpi
rpm -Uvh wkhtmltox-0.12.6-1.centos7.x86_64.rpm
上述方法安装后 wkhtmltopdf 提示 未找到命令,执行下面命令
mv /usr/local/bin/wkhtmltopdf /usr/bin/wkhtmltopdf
mv /usr/local/bin/wkhtmltoimage /usr/bin/wkhtmltoimage
然后校验
# which wkhtmltopdf
/usr/bin/wkhtmltopdf
# which wkhtmltoimage
/usr/bin/wkhtmltoimage
如果是新环境,可能还需要安装其他依赖,可以参考:
https://liumapp.gitee.io/articles/2017/04/10/1491811668985.html/
1.3 安装 python 库
pip install pdfkit
API https://pypi.python.org/pypi/pdfkit
2 完整代码
from flask import Flask,request,make_response
import pdfkit
import logging
import requests
import json
import datetime
import os
import base64
from urllib.parse import quote
# Flask application object; the @app.route decorators below register their views on it.
app = Flask(__name__)
# Directory prefix for temporary HTML/PDF files. It is concatenated directly
# with file names (no os.path.join), so the trailing slash is required.
save_pdf_dir = "./temp/"
def get_unique_id():
    """
    Build a collision-resistant identifier based on the current time.

    The ID starts with the local time formatted as ``YYYYmmddHHMMSS`` followed
    by three millisecond digits (17 digits in total), so IDs still sort
    chronologically.  Eight random hex characters are appended because a
    millisecond timestamp alone is identical for calls made within the same
    millisecond (e.g. two concurrent requests), which would make temporary
    files overwrite each other.

    :return: 25-character string: 17 timestamp digits + 8 hex characters
    """
    import uuid  # local import so this function does not depend on a new file-level import

    current_time = datetime.datetime.now()
    # '%f' yields 6 microsecond digits; dropping the last three keeps milliseconds.
    str_time = current_time.strftime('%Y%m%d%H%M%S%f')[:-3]
    return str_time + uuid.uuid4().hex[:8]
def to_weed(file_path):
    """
    Upload a local file to the weed file server and return its URL.

    The file is read fully into memory, a file id is requested from the
    ``/dir/assign`` endpoint, and the bytes are then POSTed to
    ``publicUrl + fid`` with a 6-month TTL.

    :param file_path: path of the local file to upload
    :return: ``publicUrl + fid + '.pdf'`` on success, ``False`` on any failure
             (unreadable file, network error, unexpected status code)
    """
    try:
        with open(file_path, 'rb') as f:
            file_data = f.read()
    except (OSError, TypeError, ValueError) as why:
        # TypeError/ValueError cover a None or malformed path argument.
        logging.error("获取图片失败! : {}".format(str(why)))
        return False

    print('开始发送文件至weed')
    try:
        weed_url = "http://ip:9233/dir/assign"
        r = requests.get(weed_url, data={}, timeout=3)
        if r.status_code != requests.codes.ok:
            # Previously this path failed silently; log it so failures are traceable.
            logging.error("存储weed失败! : assign status {}".format(r.status_code))
            return False

        arr_data = r.json()
        logging.info(arr_data)
        files = {'file': ('image.pdf', file_data)}
        # NOTE(review): both arguments of replace() are identical, so this is a
        # no-op as written - presumably a host rewrite was intended; confirm.
        public_url = arr_data['publicUrl'].replace("ip:9081", "ip:9081")
        res = requests.post(public_url + arr_data['fid'] + "?&ttl=6M", files=files, timeout=10)
        if res.status_code != 201:
            logging.error("存储weed失败! : upload status {}".format(res.status_code))
            return False

        return ''.join([arr_data['publicUrl'], arr_data['fid'], '.pdf'])
    except Exception as why:
        # Boundary handler: network errors, invalid JSON, or missing keys in the
        # assign response all count as "upload failed" for the callers.
        logging.error("存储weed失败! : {}".format(str(why)))
        return False
@app.route('/file/html_to_pdf', methods=['GET'])
def file_to_pdf():
    """
    Convert an HTML file on the server's disk to PDF and upload it to weed.

    Query parameters:
        source_path: path of the HTML file to convert.

    :return: JSON string ``{"code", "weed_url", "msg"}``; ``code`` is 200 with
             the uploaded URL on success, 0 with an empty URL on any failure.
    """
    return_data = {"code": 0, "weed_url": "", "msg": "转换失败"}

    source_path = request.args.get('source_path')
    print(source_path)
    if not source_path:
        # Nothing to convert; answer with the regular failure payload.
        return json.dumps(return_data)

    # SECURITY NOTE(review): source_path is client-supplied and is read from
    # the local disk with local-file access enabled; consider restricting it
    # to an allowed directory before exposing this endpoint.

    # The conversion fails if the output directory is missing, so create it.
    os.makedirs(save_pdf_dir, exist_ok=True)
    save_file_path = save_pdf_dir + ''.join([get_unique_id(), '.pdf'])
    print(save_file_path)

    try:
        if pdfkit.from_file(source_path, save_file_path, options={'enable-local-file-access': True}):
            weed_url = to_weed(save_file_path)
            if weed_url:
                return_data = {"code": 200, "weed_url": weed_url, "msg": "转换成功"}
    except Exception as why:
        logging.error("转换失败! : {}".format(str(why)))
    finally:
        # Always remove the temporary PDF, whether or not conversion/upload worked.
        try:
            os.remove(save_file_path)
        except OSError:
            pass  # the file was never created
    return json.dumps(return_data)
@app.route('/file/html_binary_to_pdf', methods=['POST'])
def html_binary_to_pdf():
    """
    Convert an uploaded HTML file to PDF (images disabled) and upload it to weed.

    Form data:
        file: the HTML file to convert (multipart upload).

    :return: JSON string. ``{"status": 50000, "message"}`` when no file was
             sent; otherwise ``{"code", "weed_url", "msg"}`` with ``code`` 200
             on success and 0 on failure.
    """
    html_file = request.files.get('file')
    if html_file is None:
        # No file part in the request.
        return json.dumps({'status': 50000, 'message': '文件上传失败'})

    # Keep the uploaded file's extension; the filename may be missing.
    suffix = os.path.splitext(html_file.filename or '')[-1]

    # Saving and converting both fail if the temp directory is missing.
    os.makedirs(save_pdf_dir, exist_ok=True)
    html_save_path = save_pdf_dir + ''.join([get_unique_id(), suffix])
    save_file_path = save_pdf_dir + ''.join([get_unique_id(), '.pdf'])

    html_file.save(html_save_path)
    print(html_save_path)
    print('保存html文件成功')
    print(save_file_path)

    return_data = {"code": 0, "weed_url": "", "msg": "转换失败"}
    print('开始转换')
    try:
        trans_res = pdfkit.from_file(
            html_save_path,
            save_file_path,
            options={'enable-local-file-access': True, '--no-images': ''},
        )
        print(trans_res)
        print('转换结束')
        if trans_res:
            weed_url = to_weed(save_file_path)
            print(weed_url)
            if weed_url:
                return_data = {"code": 200, "weed_url": weed_url, "msg": "转换成功"}
    except Exception as why:
        logging.error("转换失败! : {}".format(str(why)))
    finally:
        # Remove both temp files on every path; previously the uploaded HTML
        # was left behind whenever the conversion raised.
        for temp_path in (save_file_path, html_save_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass  # already gone or never created
    return json.dumps(return_data)
@app.route('/img_proxy', methods=['GET'])
def img_url_porxy():
    """
    Fetch a remote file and relay it to the client for preview or download.

    Query parameters:
        img_uuid:    URL of the file to fetch.
        file_name:   name presented to the client; its extension selects the
                     response content type.
        is_download: optional; absent, empty, ``0`` or ``false`` means preview
                     (for jpg/jpeg/png/pdf), anything else forces a download.

    :return: the proxied response, or a plain error string when a required
             parameter is missing or the fetch fails.
    """
    source_path = request.args.get('img_uuid')
    file_name = request.args.get('file_name')
    is_download = request.args.get('is_download')
    print(source_path)
    print(file_name)
    print(is_download)

    if not source_path:
        return "url is not null"
    if not file_name:
        return "file_name is not null"

    # Query values are strings, so comparing with the integer 0 never matched
    # an explicit "is_download=0"; interpret the text instead.
    wants_download = str(is_download or '').strip().lower() not in ('', '0', 'false')

    # Extensions that can be previewed inline, with their content types.
    preview_types = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "pdf": "application/pdf",
    }
    # splitext takes the LAST extension and yields '' when there is none
    # (split(".")[1] raised IndexError without a dot and mis-read "a.b.pdf").
    ext = os.path.splitext(file_name)[1][1:].lower()

    # SECURITY NOTE(review): source_path is a client-supplied URL fetched by
    # the server (SSRF risk); consider an allow-list of hosts.
    try:
        image_data = requests.get(source_path, timeout=10).content
    except Exception as why:
        logging.error("get url error: {}".format(str(why)))
        return "get url error"

    print(len(image_data))
    quoted_name = quote(file_name.encode("utf-8"))
    print("~~file_name={}".format(quoted_name))

    resp = make_response(image_data)
    if ext in preview_types and not wants_download:
        # Preview: let the browser render the content inline.
        resp.headers['Content-Disposition'] = "inline; filename={}".format(quoted_name)
        resp.headers['Content-Type'] = preview_types[ext]
    else:
        # Download: force a save dialog.
        resp.headers['Content-Type'] = 'application/force-download'
        resp.headers['Content-Disposition'] = "attachment; filename={}".format(quoted_name)
    print(resp)
    return resp
#图片文件直接上传weed的方法
@app.route('/file/to_weed', methods=['POST'])
def binary_to_weed():
    """
    Upload a base64-encoded file directly to the weed file server.

    Form fields:
        image_base64: file content, base64 encoded.
        file_name:    name used as the multipart field name of the upload.

    :return: JSON string ``{"status", "message"[, "weed_url"]}``; ``status`` is
             20000 on success and 50000 on any failure.
    """
    image_base64 = request.form.get('image_base64')
    file_name = request.form.get('file_name')
    if image_base64 is None:
        # No file content was sent.
        return json.dumps({'status': 50000, 'message': '文件上传失败'})
    if file_name is None:
        # No file name was sent.
        return json.dumps({'status': 50000, 'message': '缺少文件名'})

    try:
        img_data = base64.b64decode(image_base64)
    except (ValueError, TypeError) as why:
        # binascii.Error (a ValueError subclass) is raised for malformed input.
        logging.error("base64 decode failed: {}".format(str(why)))
        return json.dumps({'status': 50000, 'message': '文件上传失败'})

    upload_failed = {"status": 50000, "message": "上传weed失败"}
    try:
        weed_url = "http://192.168.207.150:9233/dir/assign"
        r = requests.get(weed_url, data={}, timeout=3)
        print(r.status_code)
        if r.status_code != requests.codes.ok:
            return json.dumps(upload_failed)

        arr_data = r.json()
        print(arr_data)
        files = {file_name: img_data}
        # NOTE(review): both arguments of replace() are identical, so this is a
        # no-op as written - presumably a host rewrite was intended; confirm.
        public_url = arr_data['publicUrl'].replace("ip:9081", "ip:9081")
        res = requests.post(public_url + arr_data['fid'] + "?&ttl=6M", files=files, timeout=10)
        if res.status_code != 201:
            return json.dumps(upload_failed)

        pdf_url = ''.join([arr_data['publicUrl'], arr_data['fid']])
        return json.dumps({"status": 20000, "weed_url": pdf_url, "message": "成功"})
    except Exception as why:
        logging.error("存储weed失败! : {}".format(str(why)))
        # A Flask view must not return False (that raises TypeError and yields
        # HTTP 500); report the failure in the same JSON shape instead.
        return json.dumps(upload_failed)
if __name__ == '__main__':
    app.config['JSON_AS_ASCII'] = False
    # The server listens on all interfaces, so the interactive debugger (which
    # allows arbitrary code execution) must not be on by default. Enable it
    # explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=9999, debug=debug_enabled)