PDF文件页面抽取替换工具
功能:从源压缩包内的PDF文件中抽取指定页面,替换目标压缩包内对应PDF文件的指定页面;两个压缩包中的文件通过文件名配合正则表达式进行匹配。
技术实现:Python(基础库)、PyPDF2(pdf处理)、tkinter(GUI)、zipfile(压缩包解压)、re(文件名正则匹配)、os(文件操作)
示例图片:
如需更改pdf文件名匹配正则和替换页码,请联系我(2072850636@qq.com)获取并修改源码。
如需打包为.exe工具使用,可安装pyinstaller库并在命令行中输入pyinstaller -F main.py --noconsole
import re
import tkinter as tk
from tkinter import filedialog, messagebox
from PyPDF2 import PdfReader, PdfWriter
import zipfile
import os
# Unzip helper
def unzip_file_with_encoding(zip_path, extract_path):
    """Extract a ZIP archive while repairing non-ASCII member names.

    Members whose header has the UTF-8 flag set (general-purpose bit 11)
    are already decoded correctly by ``zipfile`` and are used unchanged.
    For every other member ``zipfile`` decodes the stored name as cp437, so
    the name is re-encoded to cp437 to recover the raw bytes and then
    decoded as UTF-8, falling back to GBK. A member whose name cannot be
    decoded either way is reported on stdout and skipped.

    The leading path component of each member name (everything up to and
    including the first ``/``) is removed before extraction. Members whose
    name becomes empty after that are not extracted.

    Args:
        zip_path: Path of the ZIP archive to read.
        extract_path: Directory the members are extracted into.
    """
    utf8_name_flag = 0x800  # general-purpose bit 11: name is stored as UTF-8
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for zip_info in zip_ref.infolist():
            if zip_info.flag_bits & utf8_name_flag:
                # Already a correct str; re-encoding it as cp437 would raise
                # UnicodeEncodeError for CJK characters.
                original_filename = zip_info.filename
            else:
                raw_name = zip_info.filename.encode('cp437')
                try:
                    original_filename = raw_name.decode('utf-8')
                except UnicodeDecodeError:
                    try:
                        original_filename = raw_name.decode('gbk')  # GBK fallback
                    except UnicodeDecodeError:
                        print(f"Cannot decode {zip_info.filename}, skipped.")
                        continue
            # Build the new path: drop the archive's top-level directory.
            new_filename = re.sub(r'^[^/]+/', '', original_filename)
            # extract() derives the output path from zip_info.filename.
            zip_info.filename = new_filename
            if new_filename:  # nothing left means it was the top-level dir itself
                zip_ref.extract(zip_info, extract_path)
# Scan the source directory and rebuild every matching target PDF
def replace_pages(source_path, target_path, pages_to_replace, output_dir):
    """Rebuild target PDFs with the first page of their matching source PDF.

    Each entry name in ``source_path`` has a trailing ``-新.pdf`` rewritten
    to ``.pdf``. When an entry with exactly that name exists in
    ``target_path``, a new PDF is written to ``output_dir`` under the same
    name. It consists of page 0 of the source PDF followed by every target
    page whose zero-based index is not in ``pages_to_replace``.

    Args:
        source_path: Directory holding the source PDFs.
        target_path: Directory holding the target PDFs.
        pages_to_replace: Zero-based target page indices to leave out.
        output_dir: Existing directory that receives the rebuilt PDFs.

    Returns:
        The processed names of the source entries that had no counterpart
        in ``target_path``, in directory-listing order.
    """
    # List the target directory once instead of once per source file.
    target_names = set(os.listdir(target_path))
    dropped_pages = set(pages_to_replace)
    unmatched_files = []
    for source_file in os.listdir(source_path):
        processed_name = re.sub(r'-新\.pdf$', '.pdf', source_file)
        if processed_name not in target_names:
            unmatched_files.append(processed_name)
            continue
        source_pdf = PdfReader(os.path.join(source_path, source_file))
        target_pdf = PdfReader(os.path.join(target_path, processed_name))
        output_file_path = os.path.join(output_dir, processed_name)
        writer = PdfWriter()
        print("writer创建成功!")
        # The first source page stands in for the dropped target pages.
        writer.add_page(source_pdf.pages[0])
        for page_number, page in enumerate(target_pdf.pages):
            if page_number not in dropped_pages:
                writer.add_page(page)
        writer.write(output_file_path)
        print("Success!")
    return unmatched_files
# 示例:合并PDF文件
# def merge_pdfs(pdf_paths, output_path):
# pdf_writer = PdfWriter()
# for path in pdf_paths:
# pdf_reader = PdfReader(path)
# for page in range(len(pdf_reader.pages)):
# pdf_writer.add_page(pdf_reader.pages[page])
#
# with open(output_path, 'wb') as output_pdf:
# pdf_writer.write(output_pdf)
def select_zip_file():
    """Open a file dialog and store the chosen path in ``zip_entry``.

    A cancelled dialog yields an empty result; in that case the current
    value of ``zip_entry`` is kept instead of being wiped.
    """
    file_path = filedialog.askopenfilename()
    if file_path:  # empty result: the dialog was cancelled
        zip_entry.set(file_path)
def select_zip_file2():
    """Open a file dialog and store the chosen path in ``zip_entry2``.

    A cancelled dialog yields an empty result; in that case the current
    value of ``zip_entry2`` is kept instead of being wiped.
    """
    file_path = filedialog.askopenfilename()
    if file_path:  # empty result: the dialog was cancelled
        zip_entry2.set(file_path)
def select_specific_dir():
    """Open a directory dialog and store the choice in ``specific_dir_entry``.

    A cancelled dialog yields an empty result; in that case the current
    value of ``specific_dir_entry`` is kept instead of being wiped.
    """
    folder_path = filedialog.askdirectory()
    if folder_path:  # empty result: the dialog was cancelled
        specific_dir_entry.set(folder_path)
def start_operation(source_zip1, source_zip2, specific_dir):
    """Unpack both archives, rebuild the PDFs and report the outcome.

    ``source_zip1`` is extracted into ``<specific_dir>/source`` and
    ``source_zip2`` into ``<specific_dir>/target``. The rebuilt PDFs are
    written to ``<specific_dir>/solved_pdf`` by ``replace_pages``, called
    with page indices ``[0, 1, 2]``. A message box then reports whether
    every source file found a counterpart.

    If any argument is empty, a warning is shown and nothing is done. An
    empty ``specific_dir`` would otherwise make the output land relative to
    the current working directory. If extraction or PDF processing fails,
    an error dialog is shown and the exception is re-raised.

    Args:
        source_zip1: Path of the ZIP archive holding the source PDFs.
        source_zip2: Path of the ZIP archive holding the target PDFs.
        specific_dir: Directory under which the working and output
            sub-directories are created.
    """
    if not (source_zip1 and source_zip2 and specific_dir):
        messagebox.showwarning("提示", "请先选择两个ZIP文件和结果保存目录。")
        return

    source_path = os.path.join(specific_dir, "source")
    target_path = os.path.join(specific_dir, "target")
    output_dir = os.path.join(specific_dir, "solved_pdf")

    try:
        # Unpack the first archive into "source", the second into "target".
        unzip_file_with_encoding(source_zip1, source_path)
        unzip_file_with_encoding(source_zip2, target_path)
        os.makedirs(output_dir, exist_ok=True)
        print("source_path", source_path)
        print("target_path", target_path)
        unmatched_files = replace_pages(source_path, target_path, [0, 1, 2], output_dir)
    except Exception as exc:
        # GUI boundary: surface the failure in a dialog, because stderr may
        # not be visible to the user, then let the exception propagate.
        messagebox.showerror("错误", f"处理失败:{exc}")
        raise

    print("未处理的文件:", unmatched_files)
    if unmatched_files:
        messagebox.showinfo("完成", f"操作完成!存在未匹配的文件:\n{' '.join(unmatched_files)}")
    else:
        messagebox.showinfo("完成", "操作完成,所有文件均已匹配。")
# ---- Main window -----------------------------------------------------------
app = tk.Tk()
app.title('PDF处理工具')


def _fill_path_row(row, caption, on_browse, path_var):
    """Fill *row* with a caption label, a browse button and a path entry."""
    tk.Label(row, text=caption).pack(side=tk.LEFT)
    tk.Button(row, text="浏览", command=on_browse).pack(side=tk.LEFT)
    tk.Entry(row, textvariable=path_var, width=50).pack(side=tk.LEFT)


def _run_from_form():
    """Start the operation with the paths currently shown in the form."""
    start_operation(
        zip_entry.get(),
        zip_entry2.get(),
        specific_dir_entry.get(),
    )


# Row 1: path of the first ZIP archive (source).
zip_entry = tk.StringVar(app)
frame1 = tk.Frame(app)
frame1.pack(padx=10, pady=5)
_fill_path_row(frame1, "选择第一个ZIP文件(Source):", select_zip_file, zip_entry)

# Row 2: path of the second ZIP archive (target).
zip_entry2 = tk.StringVar(app)
frame2 = tk.Frame(app)
frame2.pack(padx=10, pady=5)
_fill_path_row(frame2, "选择第二个ZIP文件(Target):", select_zip_file2, zip_entry2)

# Row 3: directory that receives the extracted files and the results.
specific_dir_entry = tk.StringVar(app)
frame3 = tk.Frame(app)
frame3.pack(padx=10, pady=5)
_fill_path_row(frame3, "选择指定目录保存结果:", select_specific_dir, specific_dir_entry)

# Start button: reads the three fields at click time.
start_button = tk.Button(app, text="开始", command=_run_from_form)
start_button.pack(pady=20)

app.mainloop()