首先,合并多个PDF文件需要用到PyPDF2库:
pip install PyPDF2
import PyPDF2
def merge_pdfs(input_pdfs, output_pdf):
    """Merge several PDF files, in order, into one PDF file.

    NOTE: ``PdfFileMerger`` only exists in PyPDF2 releases before 3.0.0.
    On PyPDF2 >= 3.0.0 constructing it raises
    ``PyPDF2.errors.DeprecationError``; ``PdfMerger`` is the replacement.

    Args:
        input_pdfs: Iterable of paths of the PDF files to merge.
        output_pdf: Path of the merged PDF file to write.

    Returns:
        None. Success or failure is reported on stdout; exceptions raised
        while appending or writing are caught and printed, not propagated.
    """
    # Created outside the ``try`` on purpose: a failure to construct the
    # merger (e.g. the class being unavailable) propagates to the caller.
    pdf_merger = PyPDF2.PdfFileMerger()
    try:
        # Hand the merger the path itself so that it owns the file handle
        # and keeps it available until ``write`` has finished.
        for pdf_file in input_pdfs:
            pdf_merger.append(pdf_file)
        # Write the merged result to the output file.
        with open(output_pdf, 'wb') as output_file:
            pdf_merger.write(output_file)
        print("PDF文件合并成功!")
    except Exception as e:
        print(f"发生错误:{e}")
    finally:
        # Release every input file the merger opened, even after an error.
        pdf_merger.close()
# Usage example
input_pdfs = ["file1.pdf", "file2.pdf", "file3.pdf"] # Replace with the list of your PDF file paths
output_pdf = "merged_file.pdf" # File name of the merged PDF
merge_pdfs(input_pdfs, output_pdf)
但是,会出现如下报错:PyPDF2.errors.DeprecationError: PdfFileMerger is deprecated and was removed in PyPDF2 3.0.0. Use PdfMerger instead.
意思是PdfFileMerger类在PyPDF2 3.0.0中已经被移除,需要改用PdfMerger类。
下一步是把合并后的PDF转换为Word文件,并且消除水印。
还需要之前我的博客里说到的python-docx库和PyMuPDF库:pip install PyPDF2 python-docx PyMuPDF
import PyPDF2
from docx import Document
import fitz # PyMuPDF
def merge_and_convert_pdfs_to_word(input_pdfs, output_word_file,
                                   merged_pdf_file="merged_file.pdf"):
    """Merge PDF files and export the merged pages into a Word document.

    The inputs are merged into ``merged_pdf_file``; every page of that PDF
    is then rendered to a PNG image and inserted, in page order, into a new
    Word document saved as ``output_word_file``. Images are passed to
    python-docx in memory, so no intermediate ``image_*.png`` files are
    written.

    Args:
        input_pdfs: Iterable of paths of the PDF files to merge.
        output_word_file: Path of the ``.docx`` file to write.
        merged_pdf_file: Path of the intermediate merged PDF. Defaults to
            ``"merged_file.pdf"`` in the current working directory.

    Returns:
        None. Success or failure is reported on stdout; exceptions raised
        during merging or conversion are caught and printed, not propagated.
    """
    from io import BytesIO  # local import keeps this snippet self-contained

    pdf_merger = PyPDF2.PdfMerger()
    try:
        # Merge the input PDFs and store the result on disk.
        for pdf_file in input_pdfs:
            pdf_merger.append(pdf_file)
        with open(merged_pdf_file, 'wb') as merged_pdf:
            pdf_merger.write(merged_pdf)

        word_document = Document()
        # Scale each page image to the printable width so it is not clipped
        # by the page margins of the Word document.
        section = word_document.sections[0]
        usable_width = (section.page_width
                        - section.left_margin
                        - section.right_margin)

        with fitz.open(merged_pdf_file) as pdf_document:
            for page in pdf_document:
                # NOTE(review): clean_contents() cleans/normalizes the page's
                # content streams; whether that removes a given watermark
                # depends on how the watermark is embedded -- confirm on
                # real input files.
                page.clean_contents()
                # Render the whole page and insert it as one picture.
                png_bytes = page.get_pixmap().tobytes("png")
                word_document.add_picture(BytesIO(png_bytes),
                                          width=usable_width)

        # Save the Word file.
        word_document.save(output_word_file)
        print("PDF文件合并、转换为Word并消除水印成功!")
    except Exception as e:
        print(f"发生错误:{e}")
    finally:
        # Release every input file the merger opened, even after an error.
        pdf_merger.close()
# Usage example
input_pdfs = ["file1.pdf", "file2.pdf", "file3.pdf"] # Replace with the list of your PDF file paths
output_word_file = "merged_and_converted_file.docx" # Replace with the file name for the merged Word document
merge_and_convert_pdfs_to_word(input_pdfs, output_word_file)