03 HTML转PDF工具示例详解

云上凯歌

于 2023-10-31 17:11:27 发布

阅读量294

点赞数 4

分类专栏： python 文章标签： html pdf python

本文链接：https://blog.csdn.net/qq_16089135/article/details/134144908

版权

python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

python使用 wkhtmltopdf 和 pdfkit 批量加载html生成pdf，适用于博客备份和官网文档打包

wkhtmltopdf 是一个非常好用的工具，可在多个平台上将 html 转换为 pdf；pdfkit 是 wkhtmltopdf 的 Python 封装包

1.安装 wkhtmltopdf

1.1 windows 安装

https://wkhtmltopdf.org/downloads.html

下载版本 Windows (MinGW) 0.12.4 32-bit / 64-bit for Windows XP/2003 or later; standalone

将安装目录 D:\Program Files\wkhtmltopdf\bin 添加到系统环境变量 PATH

需要重新打开cmd以及notepad++

1.2 centos 7 安装

https://download.csdn.net/download/qq_16089135/88488817

执行以下命令

 yum install xorg-x11-fonts-75dpi
 rpm -Uvh wkhtmltox-0.12.6-1.centos7.x86_64.rpm

如果按上述方法安装后，执行 wkhtmltopdf 时提示未找到命令，则执行下面的命令

mv /usr/local/bin/wkhtmltopdf /usr/bin/wkhtmltopdf
mv /usr/local/bin/wkhtmltoimage /usr/bin/wkhtmltoimage

然后校验

 which wkhtmltopdf
/usr/bin/wkhtmltopdf
 which wkhtmltoimage
/usr/bin/wkhtmltoimage

如果是新环境，可能还需要安装其他依赖，可以参考：

https://liumapp.gitee.io/articles/2017/04/10/1491811668985.html/

1.3 安装 python 库

pip install pdfkit

API https://pypi.python.org/pypi/pdfkit

2 完整代码

from flask import Flask,request,make_response
import pdfkit
import logging
import requests
import json
import datetime
import os
import base64
from urllib.parse import quote


# Flask application object; the @app.route decorators in this file register
# their view functions on it.
app = Flask(__name__)

# Directory prefix for generated/uploaded temporary files. It is joined to a
# file name by plain string concatenation, so the trailing slash is required.
save_pdf_dir = "./temp/"

def get_unique_id():
    """
    Build an identifier string from the current local time.

    The timestamp is rendered as ``YYYYmmddHHMMSS`` followed by the
    microsecond field, and the last three characters are then cut off so
    that only millisecond precision remains.

    :return: the timestamp-based identifier as a string of digits
    """
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    # %f contributes six microsecond digits; drop the trailing three.
    millis_stamp = stamp[:len(stamp) - 3]
    return str(millis_stamp)

def to_weed(file_path):
    """
    Upload a local file to the weed file server and return its URL.

    The file is read fully into memory, a file id is requested from the
    ``/dir/assign`` endpoint, and the bytes are POSTed to
    ``publicUrl + fid`` with a ``ttl=6M`` query parameter.

    :param file_path: path of the local file to upload
    :return: ``publicUrl + fid + '.pdf'`` when the upload answers HTTP 201;
        ``False`` on every failure (unreadable file, non-success status from
        either request, or an exception while talking to the server)
    """
    try:
        with open(file_path, 'rb') as f:
            img_data = f.read()
    except Exception as why:
        logging.error("获取图片失败! : {}".format(str(why)))
        return False
    print('开始发送文件至weed')
    try:
        weed_url = "http://ip:9233/dir/assign"
        r = requests.get(weed_url, data={}, timeout=3)
        if r.status_code != requests.codes.ok:
            # Previously this path fell through and returned None silently.
            logging.error("存储weed失败! : assign status_code={}".format(r.status_code))
            return False
        arr_data = r.json()
        logging.info(arr_data)
        files = {'file': ('image.pdf', img_data)}
        # NOTE(review): as written this replace() maps a string onto itself and
        # changes nothing; it looks like a placeholder for rewriting the
        # advertised host to one reachable from this service - confirm.
        publicUrl = arr_data['publicUrl'].replace("ip:9081", "ip:9081")
        res = requests.post(publicUrl + arr_data['fid'] + "?&ttl=6M", files=files, timeout=10)
        if res.status_code != 201:
            logging.error("存储weed失败! : upload status_code={}".format(res.status_code))
            return False
        # The returned link is built from the un-rewritten publicUrl, as before.
        return ''.join([arr_data['publicUrl'], arr_data['fid'], '.pdf'])
    except Exception as why:
        logging.error("存储weed失败! : {}".format(str(why)))
        return False

@app.route('/file/html_to_pdf',methods = ['GET'])
def file_to_pdf():
    """
    Convert a local HTML file to PDF and upload the result to weed.

    Query parameters:
        source_path: path of the HTML file to convert.

    :return: JSON string ``{"code", "weed_url", "msg"}``; ``code`` is 200 with
        the uploaded URL on success and 0 with an empty URL on any failure.

    The generated PDF is a temporary artifact: it is removed once the request
    finishes, whether or not the upload succeeded, so failed uploads no
    longer accumulate files in the temp directory.
    """
    return_data = {"code":0,"weed_url":"","msg":"转换失败"}
    # SECURITY NOTE(review): source_path comes straight from the client and is
    # rendered with local file access enabled, so any file readable by this
    # process can be converted and uploaded. Restrict it to a known base
    # directory before exposing this endpoint.
    source_path = request.args.get('source_path')
    print(source_path)
    if not source_path:
        return json.dumps(return_data)

    # The output directory may not exist on a fresh deployment.
    os.makedirs(save_pdf_dir, exist_ok=True)
    save_file_path = save_pdf_dir + ''.join([get_unique_id(), '.pdf'])
    print(save_file_path)
    try:
        try:
            trans_res = pdfkit.from_file(source_path, save_file_path, options={'enable-local-file-access':True})
        except Exception as why:
            logging.error("转换失败! : {}".format(str(why)))
            return json.dumps(return_data)

        if trans_res:
            weed_url = to_weed(save_file_path)
            if weed_url:
                return_data = {"code":200, "weed_url":weed_url, "msg":"转换成功"}
        return json.dumps(return_data)
    finally:
        # Always clean up the temporary PDF; a failed removal must not mask
        # the response that is already being returned.
        try:
            if os.path.exists(save_file_path):
                os.remove(save_file_path)
        except OSError as why:
            logging.error("删除临时文件失败! : {}".format(str(why)))

@app.route('/file/html_binary_to_pdf',methods = ['POST'])
def html_binary_to_pdf():
    """
    Convert an uploaded HTML file to PDF (images disabled) and upload it to weed.

    Form data:
        file: the HTML file to convert (multipart upload).

    :return: JSON string. When no file was sent:
        ``{'status': 50000, 'message': ...}``. Otherwise
        ``{"code", "weed_url", "msg"}`` with ``code`` 200 on success and 0 on
        any conversion/upload failure.

    Both the saved upload and the generated PDF are temporary and are removed
    when the request finishes, regardless of the outcome.
    """
    html_file = request.files.get('file')
    if html_file is None:
        # No file was sent with the request.
        return_data = {'status': 50000,'message': '文件上传失败'}
        return json.dumps(return_data)

    # filename may be missing; only its extension is reused for the temp copy.
    suffix = os.path.splitext(html_file.filename or '')[-1]
    # The output directory may not exist on a fresh deployment.
    os.makedirs(save_pdf_dir, exist_ok=True)
    # One id for both files; the "_src" marker keeps the input and output
    # paths distinct even if the upload itself carries a ".pdf" suffix.
    unique_id = get_unique_id()
    html_save_path = save_pdf_dir + ''.join([unique_id, '_src', suffix])
    save_file_path = save_pdf_dir + ''.join([unique_id, '.pdf'])
    return_data = {"code":0,"weed_url":"","msg":"转换失败"}
    try:
        html_file.save(html_save_path)
        print(html_save_path)
        print('保存html文件成功')
        print(save_file_path)
        print('开始转换')
        try:
            trans_res = pdfkit.from_file(html_save_path, save_file_path, options={'enable-local-file-access':True, '--no-images': ''})
        except Exception as why:
            logging.error("转换失败! : {}".format(str(why)))
            return json.dumps(return_data)
        print(trans_res)
        print('转换结束')
        if trans_res:
            weed_url = to_weed(save_file_path)
            print(weed_url)
            if weed_url:
                return_data = {"code":200, "weed_url":weed_url, "msg":"转换成功"}
        return json.dumps(return_data)
    finally:
        # Clean up both temporary files on every path; a failed removal is
        # logged but must not replace the response being returned.
        for temp_path in (save_file_path, html_save_path):
            try:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            except OSError as why:
                logging.error("删除临时文件失败! : {}".format(str(why)))

@app.route('/img_proxy',methods = ['GET'])
def img_url_porxy():
    """
    Fetch a remote file and relay it to the client for preview or download.

    Query parameters:
        img_uuid: URL of the file to fetch.
        file_name: name reported to the client; its extension selects the
            response type.
        is_download: optional; any non-empty value other than ``0``/``false``
            forces a download response.

    :return: a response carrying the fetched bytes, or a plain error string
        when a required parameter is missing or the fetch fails.
    """
    # SECURITY NOTE(review): img_uuid is fetched server-side exactly as given,
    # so this endpoint can be pointed at internal addresses. Consider an
    # allow-list of hosts before exposing it.
    source_path = request.args.get('img_uuid')
    print(source_path)
    file_name = request.args.get('file_name')
    print(file_name)
    raw_download = request.args.get('is_download')
    print(raw_download)
    # Query values are strings: the old `is_download == 0` test was only true
    # when the parameter was absent, so "?is_download=0" forced a download.
    is_download = bool(raw_download) and raw_download.strip().lower() not in ("0", "false")
    if not source_path:
        return "url is not null"
    if not file_name:
        return "file_name is not null"

    # Media type per previewable extension.
    preview_types = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "pdf": "application/pdf",
    }
    # Use the text after the LAST dot; a name without a dot has no extension
    # (the old split(".")[1] raised IndexError there and mis-read "a.b.pdf").
    _, dot, tail = file_name.rpartition(".")
    ext = tail.lower() if dot else ""

    try:
        image_data = requests.get(source_path, timeout=10).content
    except Exception as why:
        logging.error("get url error : {}".format(str(why)))
        return "get url error"

    print(len(image_data))
    file_name = quote(file_name.encode("utf-8"))
    print("~~file_name={}".format(file_name))
    resp = make_response(image_data)
    if ext in preview_types and not is_download:
        # Preview: let the browser render the content inline.
        resp.headers['Content-Disposition'] = "inline; filename={}".format(file_name)
        resp.headers['Content-Type'] = preview_types[ext]
    else:
        # Download: force a save dialog.
        resp.headers['Content-Type'] = 'application/force-download'
        resp.headers['Content-Disposition'] = "attachment; filename={}".format(file_name)
    print(resp)
    return resp

# Upload a base64-encoded image/file directly to weed.
@app.route('/file/to_weed',methods = ['POST'])
def binary_to_weed():
    """
    Decode a base64 payload and store it on the weed file server.

    Form data:
        image_base64: base64-encoded file content.
        file_name: name used as the multipart field for the upload.

    :return: JSON string ``{"status", "message"}``, plus ``"weed_url"`` on
        success. ``status`` is 20000 on success and 50000 on any failure.
        Every path returns a JSON string; the previous ``False``/``None``
        returns were not valid view responses.
    """
    html_file = request.form.get('image_base64')
    file_name = request.form.get('file_name')
    if html_file is None:
        # No payload was sent.
        return json.dumps({'status': 50000,'message': '文件上传失败'})
    if file_name is None:
        # No file name was sent.
        return json.dumps({'status': 50000,'message': '缺少文件名'})

    try:
        img_data = base64.b64decode(html_file)
    except ValueError as why:
        # binascii.Error (malformed base64) is a ValueError subclass.
        logging.error("存储weed失败! : {}".format(str(why)))
        return json.dumps({'status': 50000,'message': '文件上传失败'})

    upload_failed = {"status":50000, "message":"上传weed失败" }
    try:
        weed_url = "http://192.168.207.150:9233/dir/assign"
        r = requests.get(weed_url, data={}, timeout=3)
        print(r.status_code)
        if r.status_code != requests.codes.ok:
            logging.error("存储weed失败! : assign status_code={}".format(r.status_code))
            return json.dumps(upload_failed)
        arr_data = r.json()
        print(arr_data)
        files = {file_name: img_data}
        # NOTE(review): as written this replace() maps a string onto itself and
        # changes nothing; it looks like a placeholder for a host rewrite - confirm.
        publicUrl = arr_data['publicUrl'].replace("ip:9081", "ip:9081")
        res = requests.post(publicUrl + arr_data['fid'] + "?&ttl=6M", files=files, timeout=10)
        if res.status_code != 201:
            return json.dumps(upload_failed)
        pdf_url = ''.join([arr_data['publicUrl'], arr_data['fid']])
        return json.dumps({"status":20000, "weed_url":pdf_url,"message":"成功" })
    except Exception as why:
        logging.error("存储weed失败! : {}".format(str(why)))
        return json.dumps(upload_failed)


if __name__ == '__main__':
    # NOTE(review): the views in this file serialize with json.dumps rather
    # than Flask's jsonify, so this setting presumably has no effect on their
    # output - confirm before relying on it.
    app.config['JSON_AS_ASCII'] = False
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default (it allows code execution from the browser).
    # Opt in explicitly with FLASK_DEBUG=1 during local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true')
    app.run(host='0.0.0.0', port=9999, debug=debug_enabled)