PDF文件页面抽取替换工具

Hu_Zhitao

已于 2024-02-21 12:11:42 修改

阅读量307

点赞数 1

文章标签： pdf python

于 2024-02-21 11:51:02 首次发布

本文链接：https://blog.csdn.net/qq_43173865/article/details/136207805

版权

PDF文件页面抽取替换工具
功能：从源pdf文件压缩包中抽取pdf文件中的指定页码，替换至目标pdf文件压缩包中的pdf文件的指定页码，两个压缩包的文件之间通过文件名+正则表达式匹配。
技术实现：Python（基础库）、PyPDF2（pdf处理）、tkinter（GUI）、zipfile（压缩包解压）、re（文件名正则匹配）、os（文件操作）

示例图片：
在这里插入图片描述
如需更改pdf文件名匹配正则和替换页码，请联系我（2072850636@qq.com）获取并修改源码。
如需打包为.exe工具使用，可安装pyinstaller库并在命令行中输入pyinstaller -F main.py --noconsole

import re
import tkinter as tk
from tkinter import filedialog, messagebox
from PyPDF2 import PdfReader, PdfWriter
import zipfile
import os


# 解压缩函数
def unzip_file_with_encoding(zip_path, extract_path):
    """Extract a ZIP archive, repairing mis-decoded file names.

    Entries whose UTF-8 flag (general-purpose bit 11, ``0x800``) is set have
    already been decoded correctly by ``zipfile`` and are used as-is.  For
    all other entries ``zipfile`` decodes the raw name bytes as cp437, so the
    bytes are recovered and re-decoded as UTF-8, falling back to GBK.
    Entries that cannot be decoded with either encoding are reported on
    stdout and skipped.

    The first path component of every entry (the archive's top-level
    directory) is removed before extraction; entries whose name becomes
    empty after that are not extracted.

    Args:
        zip_path: Path of the ZIP archive to read.
        extract_path: Directory the entries are extracted into.
    """
    utf8_name_flag = 0x800  # ZIP general-purpose bit 11: name is UTF-8

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for zip_info in zip_ref.infolist():
            if zip_info.flag_bits & utf8_name_flag:
                # Already decoded as UTF-8 by zipfile; encoding such a name
                # to cp437 would raise UnicodeEncodeError for CJK characters.
                original_filename = zip_info.filename
            else:
                raw_name = zip_info.filename.encode('cp437')
                for encoding in ('utf-8', 'gbk'):
                    try:
                        original_filename = raw_name.decode(encoding)
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    print(f"Cannot decode {zip_info.filename}, skipped.")
                    continue

            # Drop the top-level directory inside the archive.
            new_filename = re.sub(r'^[^/]+/', '', original_filename)
            # extract() derives the destination from zip_info.filename, so
            # the cleaned name is written back onto the entry.
            zip_info.filename = new_filename

            if new_filename:  # the top-level directory entry itself is empty
                zip_ref.extract(zip_info, extract_path)


# 扫描并替换指定页的函数
def replace_pages(source_path, target_path, pages_to_replace, output_dir):
    """Rebuild each target PDF with the first page of its matching source PDF.

    Every entry of ``source_path`` is mapped to a target file name by
    replacing a trailing ``-新.pdf`` with ``.pdf``.  When a file of that name
    exists in ``target_path``, a new PDF is written to ``output_dir`` under
    the same name, consisting of page 0 of the source PDF followed by every
    target page whose zero-based index is not in ``pages_to_replace``.

    Args:
        source_path: Directory containing the source PDFs.
        target_path: Directory containing the target PDFs.
        pages_to_replace: Zero-based target page indexes to leave out.
        output_dir: Existing directory that receives the rebuilt PDFs.

    Returns:
        The processed names of all source entries that have no counterpart
        in ``target_path``, in directory-listing order.
    """
    unmatched_files = []
    # One listing and O(1) membership tests instead of rescanning the target
    # directory for every source file.
    target_files = set(os.listdir(target_path))
    skipped_pages = set(pages_to_replace)

    for source_file in os.listdir(source_path):
        processed_name = re.sub(r'-新\.pdf$', '.pdf', source_file)
        if processed_name not in target_files:
            # Recorded per source file, so no unmatched file is lost.
            unmatched_files.append(processed_name)
            continue

        source_pdf = PdfReader(os.path.join(source_path, source_file))
        target_pdf = PdfReader(os.path.join(target_path, processed_name))
        writer = PdfWriter()
        print("writer创建成功！")
        # The first source page stands in for the removed target pages.
        writer.add_page(source_pdf.pages[0])
        for page_number, page in enumerate(target_pdf.pages):
            if page_number not in skipped_pages:
                writer.add_page(page)
        writer.write(os.path.join(output_dir, processed_name))
        print("Success!")

    return unmatched_files


# 示例：合并PDF文件
# def merge_pdfs(pdf_paths, output_path):
#     pdf_writer = PdfWriter()
#     for path in pdf_paths:
#         pdf_reader = PdfReader(path)
#         for page in range(len(pdf_reader.pages)):
#             pdf_writer.add_page(pdf_reader.pages[page])
#
#     with open(output_path, 'wb') as output_pdf:
#         pdf_writer.write(output_pdf)


def select_zip_file():
    """Ask the user for the source ZIP file and store its path in ``zip_entry``.

    If the dialog is cancelled it returns an empty value; in that case the
    previously selected path is kept instead of being wiped.
    """
    file_path = filedialog.askopenfilename()
    if file_path:
        zip_entry.set(file_path)


def select_zip_file2():
    """Ask the user for the target ZIP file and store its path in ``zip_entry2``.

    If the dialog is cancelled it returns an empty value; in that case the
    previously selected path is kept instead of being wiped.
    """
    file_path = filedialog.askopenfilename()
    if file_path:
        zip_entry2.set(file_path)


def select_specific_dir():
    """Ask the user for the working directory and store it in ``specific_dir_entry``.

    If the dialog is cancelled it returns an empty value; in that case the
    previously selected directory is kept instead of being wiped.
    """
    folder_path = filedialog.askdirectory()
    if folder_path:
        specific_dir_entry.set(folder_path)


def start_operation(source_zip1, source_zip2, specific_dir):
    """Extract both archives, rebuild the PDFs and report the outcome.

    The first archive is extracted into ``<specific_dir>/source``, the second
    into ``<specific_dir>/target``; rebuilt PDFs are written to
    ``<specific_dir>/solved_pdf``.  Target pages 0, 1 and 2 are replaced.
    The result, including any unmatched file names, is shown in a message
    box.

    Args:
        source_zip1: Path of the ZIP archive holding the source PDFs.
        source_zip2: Path of the ZIP archive holding the target PDFs.
        specific_dir: Directory under which the three sub-directories live.
    """
    # An empty directory would make os.path.join() produce relative paths and
    # silently extract into the current working directory.
    if not (source_zip1 and source_zip2 and specific_dir):
        messagebox.showwarning("提示", "请先选择两个ZIP文件和保存目录。")
        return

    source_path = os.path.join(specific_dir, "source")
    target_path = os.path.join(specific_dir, "target")
    output_dir = os.path.join(specific_dir, "solved_pdf")

    try:
        unzip_file_with_encoding(source_zip1, source_path)
        unzip_file_with_encoding(source_zip2, target_path)
        os.makedirs(output_dir, exist_ok=True)
        print("source_path", source_path)
        print("target_path", target_path)

        unmatched_files = replace_pages(source_path, target_path, [0, 1, 2], output_dir)
    except (zipfile.BadZipFile, OSError) as exc:
        # Show the failure in the GUI; a console traceback is invisible when
        # the tool is packaged without a console window.
        messagebox.showerror("错误", f"处理失败：{exc}")
        return

    print("未处理的文件：", unmatched_files)

    if unmatched_files:
        messagebox.showinfo("完成", f"操作完成！存在未匹配的文件：\n{' '.join(unmatched_files)}")
    else:
        messagebox.showinfo("完成", "操作完成，所有文件均已匹配。")


# Main window of the tool.
app = tk.Tk()
app.title("PDF处理工具")

# Row 1: label, browse button and path field for the source ZIP archive.
zip_entry = tk.StringVar(app)
frame1 = tk.Frame(app)
frame1.pack(padx=10, pady=5)
source_label = tk.Label(frame1, text="选择第一个ZIP文件（Source）:")
source_browse = tk.Button(frame1, text="浏览", command=select_zip_file)
source_field = tk.Entry(frame1, textvariable=zip_entry, width=50)
for widget in (source_label, source_browse, source_field):
    widget.pack(side=tk.LEFT)

# Row 2: label, browse button and path field for the target ZIP archive.
zip_entry2 = tk.StringVar(app)
frame2 = tk.Frame(app)
frame2.pack(padx=10, pady=5)
target_label = tk.Label(frame2, text="选择第二个ZIP文件（Target）:")
target_browse = tk.Button(frame2, text="浏览", command=select_zip_file2)
target_field = tk.Entry(frame2, textvariable=zip_entry2, width=50)
for widget in (target_label, target_browse, target_field):
    widget.pack(side=tk.LEFT)

# Row 3: label, browse button and path field for the working directory.
specific_dir_entry = tk.StringVar(app)
frame3 = tk.Frame(app)
frame3.pack(padx=10, pady=5)
dir_label = tk.Label(frame3, text="选择指定目录保存结果:")
dir_browse = tk.Button(frame3, text="浏览", command=select_specific_dir)
dir_field = tk.Entry(frame3, textvariable=specific_dir_entry, width=50)
for widget in (dir_label, dir_browse, dir_field):
    widget.pack(side=tk.LEFT)


def _on_start():
    """Read the three path fields at click time and start the operation."""
    start_operation(zip_entry.get(), zip_entry2.get(), specific_dir_entry.get())


# Start button below the three rows.
start_button = tk.Button(app, text="开始", command=_on_start)
start_button.pack(pady=20)

app.mainloop()

Hu_Zhitao

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
PDF文件页面抽取替换工具

技术实现：Python（基础库）、PyPDF2（pdf处理）、tkinter（GUI）、zipfile（压缩包解压）、re（文件名正则匹配）、os（文件操作）。功能：从源pdf文件压缩包中抽取pdf文件中的指定页码，替换至目标pdf文件压缩包中的pdf文件的指定页码，两个压缩包的文件之间通过文件名+正则表达式匹配。如需打包为工具使用，可安装pyinstaller库并在命令行中输入pyinstaller -F main.py --noconsole命令。
复制链接

扫一扫