PDF文件的批量更新与字体美化

原创于 2025-10-10 01:43:42 发布 · 238 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#数据库 #个人开发

在处理项目文档时，经常需要对文档进行批量更新与美化，这不仅包括文字内容的修改，还涉及到文档的格式与外观。今天我们将探讨如何使用Python来批量更新PDF文档中的文本，并确保新添加的文本与原文档的字体保持一致。

背景

假设你有一个项目，包含多个PDF文件，每个文件中的某些文本需要从状态"R"更新为"F"。然而，在更新过程中，新的文本在字体上与原文档不一致，导致视觉上不够统一。

解决方案

以下是我们将使用的Python脚本，它利用了PyMuPDF（fitz）库来操作PDF文件。

import os
import shutil
import fitz  # PyMuPDF
import re

def rename_and_copy_files():
    """Copy every ``_R_`` file from ``original/`` to ``updated/`` as ``_F_``.

    Both folders live next to this script. Each regular file whose name
    contains ``_R_`` is written to ``updated/`` with every ``_R_`` in its
    name replaced by ``_F_``. PDF files additionally get their page text
    rewritten through ``update_pdf_text``; all other files are copied
    unchanged (metadata included).

    The function prints progress messages and returns ``None``. It returns
    early, after printing a message, when the font file or the ``original``
    folder cannot be found.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    base_directory = os.path.join(script_dir, 'original')
    updated_directory = os.path.join(script_dir, 'updated')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(updated_directory, exist_ok=True)

    font_path = "Helvetica-Bold.ttf"
    if not os.path.isfile(font_path):
        # The data folders are resolved relative to the script, so also look
        # for the font there when it is not in the current working directory.
        bundled_font = os.path.join(script_dir, font_path)
        if not os.path.isfile(bundled_font):
            print(f"Font file not found: {font_path}")
            return
        font_path = bundled_font

    if not os.path.isdir(base_directory):
        print(f"Source directory not found: {base_directory}")
        return

    for filename in os.listdir(base_directory):
        if '_R_' not in filename:
            continue
        src = os.path.join(base_directory, filename)
        # shutil.copy2 cannot copy directories; only regular files qualify.
        if not os.path.isfile(src):
            continue
        new_filename = filename.replace('_R_', '_F_')
        dst = os.path.join(updated_directory, new_filename)
        print(f"Processing file: {filename}")
        # Compare the suffix case-insensitively so "X.PDF" is updated too.
        if filename.lower().endswith('.pdf'):
            update_pdf_text(src, dst, font_path)
        else:
            shutil.copy2(src, dst)
        print(f"Copied and renamed: {filename} to {new_filename}")

def update_pdf_text(src, dst, font_path, fontsize=10):
    """Write a copy of the PDF at *src* to *dst* with ``R`` markers set to ``F``.

    On every page two kinds of hits are processed:

    * names containing ``_R_`` -- the whole name is painted over with a white
      rectangle and re-written with ``_R_`` replaced by ``_F_``;
    * the search string ``" R "`` -- the hit is painted over and ``F`` is
      written at its left edge.

    Args:
        src: Path of the PDF to read.
        dst: Path the modified PDF is saved to.
        font_path: Font file used for all newly inserted text.
        fontsize: Size of the inserted text; also used as the distance from
            the top of the covered area to the text baseline.

    NOTE: the original glyphs are only covered, not removed from the page
    content, so they remain extractable/searchable in the output file.
    """
    # Reference name under which the font file is registered on a page. It
    # must not be a Base-14 name such as "Helvetica-Bold": for those names
    # PyMuPDF uses its built-in font and ignores the font file.
    font_ref = "F0"
    white = (1, 1, 1)
    black = (0, 0, 0)

    document = fitz.open(src)
    try:
        for page in document:
            # A name containing "_R_" several times yields several hits that
            # all expand to the same area; cover and rewrite it only once.
            handled_areas = set()
            for inst in page.search_for("_R_"):
                full_text, start_rect, end_rect = extract_full_name(page, fitz.Rect(inst))
                if not full_text:
                    continue
                area = fitz.Rect(start_rect.x0, start_rect.y0, end_rect.x1, end_rect.y1)
                area_key = tuple(round(value, 3) for value in area)
                if area_key in handled_areas:
                    continue
                handled_areas.add(area_key)

                page.draw_rect(area, color=white, fill=white)
                page.insert_text(
                    (area.x0, area.y0 + fontsize),
                    full_text.replace('_R_', '_F_'),
                    fontname=font_ref,
                    fontfile=font_path,
                    fontsize=fontsize,
                    color=black,
                )

            for inst in page.search_for(" R "):
                rect = fitz.Rect(inst)
                page.draw_rect(rect, color=white, fill=white)
                page.insert_text(
                    (rect.x0, rect.y0 + fontsize),
                    "F",
                    fontname=font_ref,
                    fontfile=font_path,
                    fontsize=fontsize,
                    color=black,
                )

        document.save(dst, garbage=4, deflate=True)
    finally:
        # Release the file handle even when a page fails to process.
        document.close()

def extract_full_name(page, rect):
    """Collect the name-like words on *page* that overlap *rect*.

    A word qualifies when its bounding box intersects *rect* and its text
    starts with at least one letter, digit, underscore or hyphen.

    Returns:
        A ``(text, start_rect, end_rect)`` tuple. ``text`` is the texts of
        the qualifying words joined without a separator, in the order
        ``page.get_text("words")`` yields them (empty if none qualify).
        ``start_rect`` is the box of a qualifying word reaching further left
        than *rect*, and ``end_rect`` the box of one reaching further right;
        each stays *rect* itself when no word extends past that side.
    """
    name_pattern = re.compile(r'[A-Za-z0-9_\-]+')
    leftmost = rect
    rightmost = rect
    pieces = []

    for entry in page.get_text("words"):
        # Each entry starts with the four box coordinates, then the text.
        bbox = fitz.Rect(entry[:4])
        if not rect.intersects(bbox):
            continue
        text = entry[4]
        if not name_pattern.match(text):
            continue
        if bbox.x0 < leftmost.x0:
            leftmost = bbox
        if bbox.x1 > rightmost.x1:
            rightmost = bbox
        pieces.append(text)

    return "".join(pieces), leftmost, rightmost

def _run_as_script():
    """Run the batch update once and report completion on stdout."""
    rename_and_copy_files()
    print("Process finished")


# Only run the batch job when executed directly, not when imported.
if __name__ == "__main__":
    _run_as_script()

关键点解释

字体设置：在update_pdf_text函数中，我们在插入新文本之前先为页面指定字体文件font_path（Helvetica-Bold，字号10），目的是让新插入的文本使用与原文档一致的字体。需要注意的是，在PyMuPDF中，外部字体文件必须先通过page.insert_font注册（或直接把fontname和fontfile参数传给insert_text），并在写入文本时用同一个fontname引用，才会真正生效。
文本替换：脚本会搜索包含_R_的名称以及两侧带空格的单个R（即" R "），先用白色矩形覆盖原有文字，再在原位置分别写入把_R_改为_F_后的名称和F。
文件操作：脚本会在updated目录不存在时创建它，用来存放更新后的文件；文件名中的_R_会被改为_F_，其中PDF文件会同时更新页面文本，非PDF文件则直接复制。