要优化合并Word文档的代码,使其保留原始样式和分页,我们需要更细致地处理文档内容的复制。以下是改进后的代码:
from docx import Document
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docx.shared import Pt
import os
def merge_word_files(file1_path, file2_path, output_path):
"""
合并两个Word文档并保存为新的文档,保留原始样式和分页
参数:
file1_path (str): 第一个Word文件路径
file2_path (str): 第二个Word文件路径
output_path (str): 合并后的输出文件路径
"""
# 创建新的文档对象
merged_document = Document()
# 复制第一个文档的内容
copy_content(merged_document, file1_path)
# 在文档之间添加分页符(可选)
# merged_document.add_page_break()
# 复制第二个文档的内容
copy_content(merged_document, file2_path)
# 保存合并后的文档
merged_document.save(output_path)
print(f"文档已成功合并并保存到: {output_path}")
def copy_content(target_doc, source_path):
"""
将源文档的内容复制到目标文档中,保留原始样式
参数:
target_doc: 目标文档对象
source_path (str): 源文档文件路径
"""
source_doc = Document(source_path)
# 复制节属性(页面设置等)
for section in source_doc.sections:
target_section = target_doc.add_section(section.start_type)
target_section.orientation = section.orientation
target_section.page_width = section.page_width
target_section.page_height = section.page_height
target_section.left_margin = section.left_margin
target_section.right_margin = section.right_margin
target_section.top_margin = section.top_margin
target_section.bottom_margin = section.bottom_margin
target_section.header_distance = section.header_distance
target_section.footer_distance = section.footer_distance
target_section.gutter = section.gutter
# 复制所有段落和表格
for element in source_doc.element.body:
if element.tag.endswith('p'):
# 处理段落
paragraph = source_doc.paragraphs[source_doc.element.body.index(element)]
new_paragraph = target_doc.add_paragraph()
new_paragraph.style = paragraph.style
# 复制段落格式
new_paragraph_format = new_paragraph.paragraph_format
source_format = paragraph.paragraph_format
new_paragraph_format.alignment = source_format.alignment
new_paragraph_format.left_indent = source_format.left_indent
new_paragraph_format.right_indent = source_format.right_indent
new_paragraph_format.first_line_indent = source_format.first_line_indent
new_paragraph_format.space_before = source_format.space_before
new_paragraph_format.space_after = source_format.space_after
new_paragraph_format.line_spacing = source_format.line_spacing
new_paragraph_format.keep_together = source_format.keep_together
new_paragraph_format.keep_with_next = source_format.keep_with_next
new_paragraph_format.page_break_before = source_format.page_break_before
new_paragraph_format.widow_control = source_format.widow_control
# 复制段落内容
for run in paragraph.runs:
new_run = new_paragraph.add_run(run.text)
new_run.bold = run.bold
new_run.italic = run.italic
new_run.underline = run.underline
new_run.font.name = run.font.name
new_run.font.size = run.font.size
new_run.font.color.rgb = run.font.color.rgb
elif element.tag.endswith('tbl'):
# 处理表格
table_index = [e for e in source_doc.element.body if e.tag.endswith('tbl')].index(element)
source_table = source_doc.tables[table_index]
# 创建新表格
new_table = target_doc.add_table(
rows=len(source_table.rows),
cols=len(source_table.columns),
style=source_table.style
)
# 复制表格内容
for i, row in enumerate(source_table.rows):
for j, cell in enumerate(row.cells):
new_cell = new_table.cell(i, j)
new_cell.text = cell.text
# 复制单元格格式
for paragraph in cell.paragraphs:
new_paragraph = new_cell.add_paragraph()
new_paragraph.style = paragraph.style
# 复制段落格式
new_paragraph_format = new_paragraph.paragraph_format
source_format = paragraph.paragraph_format
new_paragraph_format.alignment = source_format.alignment
new_paragraph_format.left_indent = source_format.left_indent
new_paragraph_format.right_indent = source_format.right_indent
new_paragraph_format.first_line_indent = source_format.first_line_indent
new_paragraph_format.space_before = source_format.space_before
new_paragraph_format.space_after = source_format.space_after
new_paragraph_format.line_spacing = source_format.line_spacing
# 复制运行格式
for run in paragraph.runs:
new_run = new_paragraph.add_run(run.text)
new_run.bold = run.bold
new_run.italic = run.italic
new_run.underline = run.underline
new_run.font.name = run.font.name
new_run.font.size = run.font.size
new_run.font.color.rgb = run.font.color.rgb
# 使用示例
if __name__ == "__main__":
file1 = r"C:\Leon\python_project\oceanxecm\2025\04\20250422-AI-投标书\word\附件4_法定代表人身份证明书.docx" # 第一个Word文件路径
file2 = r"C:\Leon\python_project\oceanxecm\2025\04\20250422-AI-投标书\word\附件5_法定代表人授权委托书.docx" # 第二个Word文件路径
output = r"C:\Leon\python_project\oceanxecm\2025\04\20250422-AI-投标书\word\merged_document.docx" # 合并后的输出文件路径
merge_word_files(file1, file2, output)
这个优化后的代码有以下改进:
-
保留原始样式:复制段落和运行的格式,包括字体、大小、颜色、加粗、斜体等属性。
-
保留分页:通过复制段落格式中的
page_break_before
等属性,保持原始文档的分页。 -
处理表格:完整复制表格及其内容,保留表格样式和单元格格式。
-
复制节属性:复制页面设置、边距等节属性,确保文档布局一致。
-
更细致的处理:对文档中的每个元素进行更细致的处理,而不是简单的整体复制。
如果你需要在两个文档之间添加分页符,可以取消代码中merged_document.add_page_break()
的注释。
注意:对于非常复杂的Word文档(包含图表、页眉页脚等),可能需要更复杂的处理。这种情况下,可能需要考虑使用专业的文档处理库或工具。