需求
- 一个PDF预览功能,由于前端预览太大的PDF时加载过慢,想要后端做PDF压缩。
- 对比多种方案后,最终选择基于 fitz(PyMuPDF)实现:将 PDF 的每一页渲染成图片,再重新合成为体积更小的 PDF。
性能对比
- 源 PDF (Size: 27.9 MB | 3[图片页数] / 8[总页数] )
方法 | 压缩比例 | 处理速度 | 依赖项 | 缺点 |
---|---|---|---|---|
PyPDF2 | 27.6MB (1%) | 0.029s | 无 | 压缩比例低 |
fitz | 7.40MB (27.6%) | 2.9s | 无 | |
Ghostscript | 16MB (27.3%) | 5.645s | Ghostscript | 需要另装工具,且需要通过命令行执行 |
aspose.pdf | 13.5 MB (27.4%) | 14.657s | 无 | 有水印 |
Spire.PDF | 6.24 MB (27.8%) | 15.17s | 无 | 有水印 |
实例代码
PyPDF2
import time
from PyPDF2 import PdfReader, PdfWriter
def compress_pdf_pypdf2(input_path, output_path):
    """Copy a PDF with PyPDF2, compressing every page's content stream.

    Each page of ``input_path`` is compressed (lossless content-stream
    compression), appended to a new writer, and the result is written to
    ``output_path``.

    :param input_path: path of the PDF to read
    :param output_path: path the rewritten PDF is saved to
    :return: None
    """
    reader = PdfReader(input_path)
    writer = PdfWriter()
    for page in reader.pages:
        # Compression is requested per page: assigning a
        # ``compress_content_streams`` attribute on the writer is a silent
        # no-op, because the writer never reads such an attribute.
        page.compress_content_streams()
        writer.add_page(page)
    with open(output_path, "wb") as f:
        writer.write(f)
if __name__ == "__main__":
    # Time one PyPDF2 compression run and print the elapsed seconds.
    began_at = time.time()
    compress_pdf_pypdf2("input.pdf", "output_pypdf2.pdf")
    elapsed = time.time() - began_at
    print(elapsed)
aspose.pdf
- 缺点:商用 - 左上角的水印去不掉
# -*- coding: utf-8 -*-
import time
import aspose.pdf as ap
def Lossless_Compression(path1, path2):
    """Optimize a PDF's resources with aspose.pdf and save the result.

    Image compression is switched on with an image quality of 90 before the
    document's resources are optimized.
    NOTE(review): a quality setting of 90 presumably means images are
    re-encoded lossily despite the function name -- confirm.

    :param path1: path of the PDF to compress
    :param path2: path the optimized PDF is saved to
    :return: None
    """
    document = ap.Document(path1)

    # Configure the optimizer: compress embedded images at quality 90.
    options = ap.optimization.OptimizationOptions()
    options.image_compression_options.compress_images = True
    options.image_compression_options.image_quality = 90

    document.optimize_resources(options)
    document.save(path2)
if __name__ == "__main__":
    # Time one aspose.pdf optimization run and print the elapsed seconds.
    began_at = time.time()
    Lossless_Compression("input.pdf", "output_aspose.pdf")
    elapsed = time.time() - began_at
    print(elapsed)
Spire.PDF
- 缺点:商用 - 左上角的水印去不掉
import time
from spire.pdf import *
def compress_pdf(input_pdf, output_pdf):
    """Compress a PDF with Spire.PDF's ``PdfCompressor``.

    The compressor's optimization options are set to medium image quality
    with image resizing and image compression enabled, then the compressed
    document is written to ``output_pdf``.

    :param input_pdf: path of the PDF to compress
    :param output_pdf: path the compressed PDF is saved to
    :return: None
    """
    pdf_compressor = PdfCompressor(input_pdf)

    # Image handling: medium quality, resize enabled, compression enabled.
    options = pdf_compressor.OptimizationOptions
    options.SetImageQuality(ImageQuality.Medium)
    options.SetResizeImages(True)
    options.SetIsCompressImage(True)

    # Run the compression and write the output file.
    pdf_compressor.CompressToFile(output_pdf)
if __name__ == "__main__":
    # Time one Spire.PDF compression run and print the elapsed seconds.
    began_at = time.time()
    compress_pdf("input.pdf", "output_spire.pdf")
    elapsed = time.time() - began_at
    print(elapsed)
Ghostscript
- 调用外部工具,压缩比例高,支持多种优化算法。
- 依赖系统安装的Ghostscript,适合对压缩率要求高的场景。
#!/bin/bash
# Rewrite input.pdf through Ghostscript's pdfwrite device and store the
# result as compressed.pdf, running non-interactively (no pause, quiet, batch).
#
# Compression levels:
#   0: default  - almost identical to /screen, 72 dpi images
#   1: prepress - high quality, color preserving, 300 dpi imgs
#   2: printer  - high quality, 300 dpi images
#   3: ebook    - low quality, 150 dpi images
#   4: screen   - screen-view-only quality, 72 dpi images
#
# NOTE(review): the command below passes no option that selects one of these
# levels (presumably this would be -dPDFSETTINGS=/ebook, /screen, ... --
# confirm against the Ghostscript docs), so the list is informational only
# for this invocation.
gs -sDEVICE=pdfwrite \
-dCompatibilityLevel=1.5 \
-dNOPAUSE \
-dQUIET \
-dBATCH \
-sOutputFile=compressed.pdf input.pdf
fitz
import time
import fitz
def auto_dpi(page):
    """Derive a rendering resolution from the width of ``page.rect``.

    The width is scaled so that a width of 800 maps to 300; the result is
    truncated to an integer and never drops below 72.

    :param page: object exposing ``rect.width``
    :return: the computed resolution as an int (at least 72)
    """
    page_width = page.rect.width
    scaled = int(300 * (page_width / 800))
    return scaled if scaled > 72 else 72
def _page_to_image_pdf(page, matrix):
    """Rasterize one page and return it wrapped as a single-page PDF (bytes)."""
    # Render without an alpha channel.
    pix = page.get_pixmap(matrix=matrix, alpha=False)
    if page.get_text("text"):
        # Pages with extractable text are encoded as PNG.
        image_format = "png"
        img_bytes = pix.tobytes(output="png")
    else:
        # Pages without extractable text are encoded as JPEG at quality 85.
        image_format = "jpeg"
        img_bytes = pix.tobytes(output="jpeg", jpg_quality=85)
    # Declare the format that was actually produced instead of always
    # claiming "jpeg", so the type hint never contradicts the PNG bytes.
    with fitz.open(stream=img_bytes, filetype=image_format) as imgdoc:
        return imgdoc.convert_to_pdf()


def compress_pdf(path1, path2, dpi):
    """Compress a PDF by re-rendering every page as an image (all in memory).

    Each page of the source document is rasterized, wrapped as a one-page
    PDF and appended to a new document, which is saved with
    ``deflate=True, garbage=3``. Nothing is written when the source has no
    pages.

    :param path1: path of the original PDF
    :param path2: path of the output PDF
    :param dpi: rendering scale. Despite its name it is applied as a zoom
        percentage: pages are rendered with a scale factor of ``dpi / 100``
        (300 means 3x). Smaller values give smaller but blurrier output.
        NOTE(review): PyMuPDF presumably renders at 72 dpi for a scale of
        1, so the effective resolution would be ``72 * dpi / 100`` --
        confirm before relying on true DPI semantics.
    :raises ValueError: if ``dpi`` is not positive
    :return: None
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be a positive number, got {dpi!r}")

    # The scaling matrix is identical for every page, so build it once.
    scale = dpi / 100.0
    matrix = fitz.Matrix(scale, scale)

    # Context managers close every document even if a page fails to
    # render; the destination is closed before the source, as before.
    with fitz.open(path1) as src_doc, fitz.open() as dst_doc:
        for page in src_doc:
            with fitz.open("pdf", _page_to_image_pdf(page, matrix)) as pdfpage:
                dst_doc.insert_pdf(pdfpage)
        if dst_doc.page_count > 0:
            dst_doc.save(path2, deflate=True, garbage=3)
if __name__ == "__main__":
    # Time one fitz compression run and print the elapsed seconds.
    began_at = time.time()
    zoom = 300  # zoom ratio passed as the third argument of compress_pdf
    compress_pdf("input.pdf", "output_fitz.pdf", zoom)
    elapsed = time.time() - began_at
    print(elapsed)