Word文档合并工具:保留样式与分页

要优化合并Word文档的代码,使其保留原始样式和分页,我们需要更细致地处理文档内容的复制。以下是改进后的代码:

from docx import Document
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docx.shared import Pt
import os

def merge_word_files(file1_path, file2_path, output_path):
    """Merge two Word documents into a new document and save it.

    The content of each source file is appended, in order, to a freshly
    created document via ``copy_content``; the result is written to
    ``output_path`` and a confirmation line is printed.

    Args:
        file1_path (str): Path of the first Word file.
        file2_path (str): Path of the second Word file.
        output_path (str): Path the merged document is saved to.
    """
    merged_document = Document()
    leading_source, *remaining_sources = (file1_path, file2_path)

    copy_content(merged_document, leading_source)
    for source_path in remaining_sources:
        # Optional: uncomment the next line to insert a page break between
        # the documents.
        # merged_document.add_page_break()
        copy_content(merged_document, source_path)

    merged_document.save(output_path)
    print(f"文档已成功合并并保存到: {output_path}")

# Page-setup attributes mirrored from a source section, in the order applied.
_SECTION_LAYOUT_ATTRS = (
    "orientation",
    "page_width",
    "page_height",
    "left_margin",
    "right_margin",
    "top_margin",
    "bottom_margin",
    "header_distance",
    "footer_distance",
    "gutter",
)

# Paragraph-format attributes mirrored onto every copied paragraph.
_PARAGRAPH_FORMAT_ATTRS = (
    "alignment",
    "left_indent",
    "right_indent",
    "first_line_indent",
    "space_before",
    "space_after",
    "line_spacing",
    "keep_together",
    "keep_with_next",
    "page_break_before",
    "widow_control",
)


def _apply_section_layout(target_section, source_section):
    """Copy the page-setup attributes of source_section onto target_section."""
    for attr in _SECTION_LAYOUT_ATTRS:
        setattr(target_section, attr, getattr(source_section, attr))


def _copy_paragraph(source_paragraph, new_paragraph):
    """Copy style, paragraph format and runs of source_paragraph into new_paragraph."""
    new_paragraph.style = source_paragraph.style

    source_format = source_paragraph.paragraph_format
    new_format = new_paragraph.paragraph_format
    for attr in _PARAGRAPH_FORMAT_ATTRS:
        setattr(new_format, attr, getattr(source_format, attr))

    # Only the run text and the character attributes listed here are carried
    # over; anything else stored on the source run is not copied.
    for run in source_paragraph.runs:
        new_run = new_paragraph.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.name = run.font.name
        new_run.font.size = run.font.size
        new_run.font.color.rgb = run.font.color.rgb


def _copy_table(source_table, target_doc):
    """Append a table to target_doc mirroring source_table's size, style and cell paragraphs."""
    new_table = target_doc.add_table(
        rows=len(source_table.rows),
        cols=len(source_table.columns),
        style=source_table.style,
    )

    for row_index, row in enumerate(source_table.rows):
        for col_index, cell in enumerate(row.cells):
            new_cell = new_table.cell(row_index, col_index)
            existing_paragraphs = new_cell.paragraphs
            for para_index, paragraph in enumerate(cell.paragraphs):
                # Fill the paragraph the new cell already contains first, and
                # add paragraphs only for the remaining ones. Writing the cell
                # text and then appending every paragraph again would store
                # the content twice.
                if para_index == 0 and existing_paragraphs:
                    new_paragraph = existing_paragraphs[0]
                else:
                    new_paragraph = new_cell.add_paragraph()
                _copy_paragraph(paragraph, new_paragraph)


def copy_content(target_doc, source_path):
    """Append the content of the document at source_path to target_doc.

    Section page setup is copied first, then every top-level paragraph and
    table of the source body is re-created in the target in document order.
    Other kinds of body elements are skipped.

    Note: all source sections are created before any content is copied, so
    the copied paragraphs and tables all land in the last section created.

    Args:
        target_doc: Document object that receives the copied content.
        source_path (str): Path of the source Word file.
    """
    # Local import so this function does not depend on the file's import block.
    from docx.oxml.ns import qn

    source_doc = Document(source_path)

    # Copy section properties (page setup etc.). When the target has no
    # content yet, reuse its existing section for the first source section:
    # adding a section to an empty document would leave an empty section in
    # front of the copied content.
    target_is_empty = not target_doc.paragraphs and not target_doc.tables
    reuse_existing_section = target_is_empty and len(target_doc.sections) > 0
    for index, section in enumerate(source_doc.sections):
        if index == 0 and reuse_existing_section:
            target_section = target_doc.sections[-1]
        else:
            target_section = target_doc.add_section(section.start_type)
        _apply_section_layout(target_section, section)

    # source_doc.paragraphs and source_doc.tables each list the body's
    # top-level items of their kind in document order, so consuming them in
    # step with the body walk pairs every element with its wrapper object.
    # The element's position among *all* body children is not a valid index
    # into either list.
    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')
    source_paragraphs = iter(source_doc.paragraphs)
    source_tables = iter(source_doc.tables)

    for element in source_doc.element.body:
        if element.tag == paragraph_tag:
            _copy_paragraph(next(source_paragraphs), target_doc.add_paragraph())
        elif element.tag == table_tag:
            _copy_table(next(source_tables), target_doc)

# Usage example
if __name__ == "__main__":
    # Paths of the two source documents and of the merged result.
    first_source = r"C:\Leon\python_project\oceanxecm\2025\04\20250422-AI-投标书\word\附件4_法定代表人身份证明书.docx"
    second_source = r"C:\Leon\python_project\oceanxecm\2025\04\20250422-AI-投标书\word\附件5_法定代表人授权委托书.docx"
    merged_target = r"C:\Leon\python_project\oceanxecm\2025\04\20250422-AI-投标书\word\merged_document.docx"

    merge_word_files(
        file1_path=first_source,
        file2_path=second_source,
        output_path=merged_target,
    )

这个优化后的代码有以下改进:

  1. 保留原始样式:复制段落和运行的格式,包括字体、大小、颜色、加粗、斜体等属性。

  2. 保留分页:通过复制段落格式中的page_break_before、keep_together、keep_with_next等分页相关属性,保持原始文档的段落级分页设置。注意:以手动分页符(插入在文本运行中的分页符)实现的分页不会被保留为分页符。

  3. 处理表格:完整复制表格及其内容,保留表格样式和单元格格式。

  4. 复制节属性:复制页面设置、边距等节属性,确保文档布局一致。

  5. 更细致的处理:对文档中的每个元素进行更细致的处理,而不是简单的整体复制。

如果你需要在两个文档之间添加分页符,可以取消代码中merged_document.add_page_break()的注释。

注意:对于非常复杂的Word文档(包含图表、页眉页脚等),可能需要更复杂的处理。这种情况下,可能需要考虑使用专业的文档处理库或工具。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值