高质量 PDF 文件转换
● 将 HTML 转换为 PDF
● 将 HTML 转换为 PDF
● 将图像转换为 PDF
● 将文本转换为 PDF
● 将 PDF 转换为 HTML
● 将 XPS 转换为 PDF
● 将 PDF 转换为 SVG
● 将 PDF 转换为 XPS
● 将 PDF 转换为图像
● 将 PDF 转换为 Word
● 将 PDF 转换为 Excel
安装
安装: pip install Spire.Pdf # 需要较高版本的 Python 才能安装
pypi说明: https://pypi.org/project/Spire.Pdf/
使用教程
本文章中所有内容仅供学习交流使用，不用于其他任何目的，不得用于违反相关法律法规的用途。
from spire.pdf.common import *
from html import unescape
from spire.pdf import *
from time import time
from lxml import html
import PyPDF2
import shutil
import re
import os
def clear_html_tag(filename):
    """Remove the Spire.PDF advertisement text from an HTML file, in place.

    The file is read as UTF-8, every match of the banner pattern is deleted,
    and the result is written back to the same path.

    Args:
        filename: Path of the HTML file to rewrite.
    """
    with open(filename, "r", encoding="utf-8") as in_file:
        content = in_file.read()
    # NOTE(review): the pattern in the published listing lost its markup; only
    # the fragments "The document was created with " and
    # ";Spire.PDF for Python[\s\S]?" survived. This reconstruction deletes any
    # single element whose text contains both fragments -- confirm against
    # real Spire.PDF output before relying on it.
    content = re.sub(
        r"<(\w+)[^>]*>[^<]*The document was created with[^<]*"
        r"Spire\.PDF for Python[^<]*</\1>",
        "",
        content,
    )
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(content)
def temp_dir():
    """Create a fresh temporary storage directory and return its path.

    Any existing ``temp_file`` directory under the current working directory
    is deleted and recreated, then a sub-directory named after the current
    time in microseconds is created inside it.

    Returns:
        The relative path ``temp_file/<timestamp>`` of the new directory.
    """
    # Wipe whatever a previous call left behind before recreating the root.
    if os.path.exists("temp_file"):
        shutil.rmtree("temp_file")
    os.mkdir("temp_file")
    # Local renamed so it no longer shadows this function's own name.
    stamp = str(int(time() * 1000000))
    dir_root = f"temp_file/{stamp}"
    if not os.path.exists(dir_root):
        os.mkdir(dir_root)
    return dir_root
def split_pdf(inputFile):
    """Split a PDF into one single-page PDF per page.

    Pages are written to a directory obtained from ``temp_dir()`` as
    ``1.pdf``, ``2.pdf``, ... in page order.

    Args:
        inputFile: Path of the PDF to split.

    Returns:
        The directory that holds the per-page PDF files.
    """
    dir_root = temp_dir()
    with open(inputFile, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # File names are 1-based page numbers.
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)
            with open(f"{dir_root}/{page_num}.pdf", "wb") as output_pdf:
                pdf_writer.write(output_pdf)
    return dir_root
def html_page(title, body):
    """Wrap *body* in a standalone HTML document titled *title*.

    NOTE(review): the template string in the published listing was lost (the
    ``return`` statement is an unterminated string). This is a minimal
    reconstruction -- restore any styling the original template carried.

    Args:
        title: Text for the ``<title>`` element; HTML-escaped here.
        body: HTML markup placed verbatim inside ``<body>``.

    Returns:
        The complete HTML document as a string.
    """
    # Imported locally: the module-level name ``html`` refers to ``lxml.html``.
    from html import escape

    return (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        f"<title>{escape(title)}</title>\n"
        "</head>\n"
        "<body>\n"
        f"{body}\n"
        "</body>\n"
        "</html>\n"
    )
def pdf_to_html_work(inputFile, outputFile):
    """Convert one PDF file to HTML and clean the generated file.

    Args:
        inputFile: Path of the PDF to load.
        outputFile: Path the HTML is saved to; afterwards it is rewritten in
            place by ``clear_html_tag``.
    """
    doc = PdfDocument()
    try:
        doc.LoadFromFile(inputFile)
        doc.SaveToFile(outputFile, FileFormat.HTML)
    finally:
        # Close the document even when loading or saving raises.
        doc.Close()
    clear_html_tag(outputFile)
def clear_blank(html_string):
    """Drop every line of *html_string* that contains the text ``"> <"``.

    Args:
        html_string: Markup to filter; lines are separated by ``"\\n"``.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    return "\n".join(
        line for line in html_string.split("\n") if "> <" not in line
    )
def sort_file(directory_path):
    """Return the entries of *directory_path* sorted by numeric file name.

    Each name is ordered by the integer before its first ``"."``; entries
    with the same number (for example ``1.pdf`` and ``1.html``) are ordered
    by full name so the result does not depend on ``os.listdir`` order.

    Args:
        directory_path: Directory to list.

    Returns:
        A list of file names (not full paths).

    Raises:
        ValueError: If a name does not start with an integer before its
            first ``"."``.
    """
    return sorted(
        os.listdir(directory_path),
        key=lambda name: (int(name.split(".")[0]), name),
    )
def pdf_to_html(inputFile, outputFile):
    """Convert a PDF into a single HTML file, one page at a time.

    The PDF is split into single-page PDFs, each page is converted to HTML,
    the ``<body>`` content of every page is collected in page order, and the
    combined markup is written to *outputFile* through ``html_page``. The
    temporary page directory is removed afterwards.

    Args:
        inputFile: Path of the source PDF.
        outputFile: Path of the HTML file to write (UTF-8). Its base name
            without extension is passed to ``html_page`` as the title.
    """
    temp_pdf_dir = split_pdf(inputFile)
    try:
        out_html_name = os.path.splitext(os.path.basename(outputFile))[0]

        # The listing is taken once, before any .html file is produced.
        for pdf_file in sort_file(temp_pdf_dir):
            pdf_file_path = f"{temp_pdf_dir}/{pdf_file}"
            html_file_path = pdf_file_path.replace(".pdf", ".html")
            print("开始处理pdf转html:", pdf_file_path)
            pdf_to_html_work(pdf_file_path, html_file_path)

        html_page_list = []
        for file_name in sort_file(temp_pdf_dir):
            if not file_name.endswith(".html"):
                continue
            with open(f"{temp_pdf_dir}/{file_name}", "r",
                      encoding="utf-8") as html_file:
                html_code = html_file.read()
            parsed_html = html.fromstring(html_code)
            body_content = parsed_html.find(".//body")
            html_code = unescape(html.tostring(body_content).decode())
            # NOTE(review): the published pattern lost its tags and read
            # r"|"; reconstructed as "strip the <body> wrapper" -- confirm.
            html_code = re.sub(r"</?body[^>]*>", "", html_code)
            html_page_list.append(clear_blank(html_code))

        html_page_body_code = "\n".join(html_page_list)
        with open(outputFile, "w", encoding="utf-8") as f:
            f.write(html_page(out_html_name, html_page_body_code))
    finally:
        # Best-effort cleanup, also when a conversion step fails.
        shutil.rmtree(temp_pdf_dir, ignore_errors=True)
# Script entry point: convert the sample PDF when run directly.
if __name__ == "__main__":
    outputFile = "xxx.html"
    inputFile = "xxxx.pdf"
    pdf_to_html(inputFile, outputFile)
其他word和excel处理工具
word: https://pypi.org/project/Spire.Doc/
excel: https://pypi.org/project/Spire.XLS-for-Python/
ppt: https://pypi.org/project/Spire.Presentation/