Word直接导出的pdf不够清晰,使用打印导出的pdf又不带书签以及目录跳转功能这一问题,查阅网上资料使用Adobe DC似乎能够解决但是下载安装比较麻烦,于是写了python程序解决该问题。
解决思路: 使用python脚本对两个pdf文件进行合并,合并小体积的带书签的pdf和高清版本的pdf文件。
Step1:准备带书签的pdf,并命名为999.pdf (该名称与程序对应)
使用word导出功能,选择最小文件导出,并勾选书签导出选项。
Step2:打印输出高清pdf文件,并命名为a.pdf
打印使用福昕pdf的虚拟打印机,最高支持2400dpi的清晰度,我一般选1200dpi,已经足够清晰了。(也可以使用自带的Microsoft pdf 打印输出 但最高只支持600dpi)
打印导出文件,并命名为a.pdf
如果遇到导出的pdf文件太大可以不勾选word选项中的“不压缩文件中的图像”来进一步限制打印输出文件大小。
###使用Microsoft pdf打印输出文件也可以用代码替代执行,但最高只有600dpi,不如福昕pdf
import win32com.client
from pathlib import Path
import os
import time
def convert_docx_to_hd_pdf(input_docx, output_pdf):
# 确保输出路径存在
output_path = Path(output_pdf).parent
os.makedirs(output_path, exist_ok=True)
# 创建Word应用对象
word = win32com.client.DispatchEx("Word.Application")
word.Visible = True
try:
# 打开文档
doc = word.Documents.Open(str(Path(input_docx).resolve()))
# 获取文档总页数
total_pages = doc.ComputeStatistics(2) # 2 = wdStatisticPages
print(f"文档共 {total_pages} 页,开始转换...")
# 输出路径
output_pdf_abs = str(Path(output_pdf).resolve())
# 设置打印机为Microsoft Print to PDF
word.ActivePrinter = "Microsoft Print to PDF"
# 设置打印选项
word.Options.PrintBackground = True
word.Application.Options.PrintDraft = False
word.Application.Options.PrintProperties = False
# 执行打印
print("开始高质量打印...")
word.ActiveDocument.PrintOut(
OutputFileName=output_pdf_abs,
Range=0,
Item=0,
Copies=1,
Pages="",
PageType=0,
PrintToFile=True,
Collate=False
)
# 等待打印完成
print("正在等待打印完成...")
# 等待文件生成
max_wait = 30 # 最多等待30秒
start_time = time.time()
while time.time() - start_time < max_wait:
if os.path.exists(output_pdf) and os.path.getsize(output_pdf) > 0:
print(f"PDF导出完成: {output_pdf}")
print(f"文件大小: {os.path.getsize(output_pdf)} 字节")
return True
time.sleep(1)
print("等待超时,请检查输出文件")
return False
except Exception as e:
print(f"转换失败: {str(e)}")
return False
finally:
try:
doc.Close(SaveChanges=False)
except:
pass
word.Quit()
# 执行转换
if __name__ == "__main__":
convert_docx_to_hd_pdf('88.docx', '999.pdf') ## 88.docx为word文件名
总之会得到两个pdf文件如下
Step3:创建虚拟环境,并执行文件合并
conda create -n xxx python=3.9
conda activate xxx
cd xxxxx
pip install pikepdf pathlib os
然后运行程序,在此之前需在程序中指定目录页码范围,比如我的word目录对应9-10页
修改如下部分
运行
import pikepdf
from pathlib import Path
import os
start_page = 8
end_page = 9
def merge_pdfs_with_bookmarks(image_pdf_path, bookmark_pdf_path, output_pdf_path):
"""
合并两个PDF文件,保留第二个PDF的书签信息和目录页,其他页面使用第一个PDF的内容
参数:
image_pdf_path: 包含高质量图片的PDF路径
bookmark_pdf_path: 包含书签信息的PDF路径
output_pdf_path: 输出PDF的路径
"""
print(f"开始合并PDF文件...")
print(f"图片源PDF: {image_pdf_path}")
print(f"书签源PDF: {bookmark_pdf_path}")
try:
# 确保输出目录存在
output_dir = Path(output_pdf_path).parent
os.makedirs(output_dir, exist_ok=True)
# 打开两个PDF文件
with pikepdf.open(image_pdf_path) as image_pdf, pikepdf.open(bookmark_pdf_path) as bookmark_pdf:
# 检查页数是否一致
if len(image_pdf.pages) != len(bookmark_pdf.pages):
print(f"警告: 两个PDF的页数不一致! 图片PDF: {len(image_pdf.pages)}页, 书签PDF: {len(bookmark_pdf.pages)}页")
print("继续合并,但可能导致书签指向错误的页面")
# 创建一个新的PDF,以bookmark_pdf为基础
merged_pdf = pikepdf.Pdf.open(bookmark_pdf_path)
# 创建页面映射表,记录原始页面和新页面的对应关系
page_map = {}
# 替换除了第8-9页以外的所有页面
for i in range(len(merged_pdf.pages)):
# 页码从0开始,所以第9-10页对应索引8-9
if (i > end_page or i < start_page) and i < len(image_pdf.pages): # i != 10-12
# 保存原始页面的引用
old_page = merged_pdf.pages[i]
old_objgen = old_page.obj.objgen
# 使用正确的方法替换页面
# pikepdf不支持直接删除页面,但可以直接替换
merged_pdf.pages[i] = pikepdf.Page(image_pdf.pages[i])
# 记录页面映射关系
page_map[old_objgen[0]] = merged_pdf.pages[i].obj
# 修复文档内部链接(目录页链接)
print("正在修复文档内部链接...")
# 特别处理目录页(第9-10页,索引8-9)
for i in [start_page, end_page]:
if '/Annots' in merged_pdf.pages[i]:
annots = merged_pdf.pages[i]['/Annots']
if isinstance(annots, pikepdf.Array):
# print(annots)
for annot in annots:
# 检查是否是链接注释
if annot.get('/Subtype') == '/Link':
# 处理直接目标
if '/Dest' in annot:
dest = annot['/Dest']
if isinstance(dest, pikepdf.Array) and len(dest) > 0:
if hasattr(dest[0], 'objgen'):
ref_id = dest[0].objgen[0]
if ref_id in page_map:
dest[0] = page_map[ref_id]
# 处理动作目标
elif '/A' in annot and isinstance(annot['/A'], pikepdf.Dictionary) and '/D' in annot['/A']:
dest = annot['/A']['/D']
if isinstance(dest, pikepdf.Array) and len(dest) > 0:
if hasattr(dest[0], 'objgen'):
ref_id = dest[0].objgen[0]
if ref_id in page_map:
dest[0] = page_map[ref_id]
# 提取书签信息
bookmarks = []
if hasattr(bookmark_pdf, 'Root') and '/Outlines' in bookmark_pdf.Root:
print("正在提取书签信息...")
# 递归提取书签
def extract_bookmarks(outline, bookmarks, depth=0):
if '/First' not in outline:
return
current = outline['/First']
while True:
title = str(current.get('/Title', ''))
dest = current.get('/Dest', None)
page_num = 0
if dest is not None and isinstance(dest, pikepdf.Array) and len(dest) > 0:
# 查找目标页面
page_ref = dest[0]
for i, page in enumerate(bookmark_pdf.pages):
# 使用对象ID比较而不是same_as方法
if hasattr(page.obj, 'objgen') and hasattr(page_ref, 'objgen'):
if page.obj.objgen == page_ref.objgen:
page_num = i
break
# 创建书签项
bookmark = {
'title': title,
'page': page_num,
'children': []
}
bookmarks.append(bookmark)
# 处理子书签
if '/First' in current:
extract_bookmarks(current, bookmark['children'], depth + 1)
# 移动到下一个书签
if '/Next' not in current:
break
current = current['/Next']
# 提取所有书签
extract_bookmarks(bookmark_pdf.Root['/Outlines'], bookmarks)
print(f"提取了 {len(bookmarks)} 个顶级书签")
# 如果有提取到书签,添加到新PDF
if bookmarks:
print("正在将书签添加到新PDF...")
# 递归创建书签
def create_bookmarks(pdf, bookmarks, parent=None):
if not bookmarks:
return None
first = None
last = None
prev = None
for bookmark in bookmarks:
# 创建新书签
current = pdf.make_indirect(pikepdf.Dictionary({
'/Title': pikepdf.String(bookmark['title']),
'/Parent': parent
}))
# 设置目标页面
page_idx = bookmark['page']
if page_idx < len(pdf.pages):
dest = [pdf.pages[page_idx].obj, pikepdf.Name('/Fit')]
current['/Dest'] = pdf.make_indirect(pikepdf.Array(dest))
# 处理链接关系
if first is None:
first = current
if prev is not None:
prev['/Next'] = current
current['/Prev'] = prev
prev = current
last = current
# 处理子书签
if bookmark['children']:
children_first = create_bookmarks(pdf, bookmark['children'], current)
if children_first:
current['/First'] = children_first
# 找到最后一个子书签
children_last = children_first
while '/Next' in children_last:
children_last = children_last['/Next']
current['/Last'] = children_last
current['/Count'] = len(bookmark['children'])
return first
# 创建书签字典
outlines = merged_pdf.make_indirect(pikepdf.Dictionary({
'/Type': pikepdf.Name('/Outlines'),
'/Count': len(bookmarks)
}))
# 创建书签树
first = create_bookmarks(merged_pdf, bookmarks, outlines)
if first:
outlines['/First'] = first
# 找到最后一个书签
last = first
while '/Next' in last:
last = last['/Next']
outlines['/Last'] = last
# 添加到PDF
merged_pdf.Root['/Outlines'] = outlines
print("成功添加书签到新PDF")
# 保存合并后的PDF
merged_pdf.save(output_pdf_path)
print(f"PDF合并完成! 输出文件: {output_pdf_path}")
print(f"文件大小: {os.path.getsize(output_pdf_path)} 字节")
return True
except Exception as e:
print(f"合并PDF时出错: {str(e)}")
import traceback
traceback.print_exc()
return False
if __name__ == "__main__":
# 文件路径
image_pdf = ".\a.pdf" # 包含高质量图片的PDF
bookmark_pdf = ".\999.pdf" # 包含书签信息的PDF
output_pdf = ".\out.pdf" # 输出文件
# 执行合并
merge_pdfs_with_bookmarks(image_pdf, bookmark_pdf, output_pdf)
大功告成
该文件大小一般与高清pdf文件大小相当...
特别说明:该程序对正文中的图跳转未作程序编写。