python—批量删除pdf页面

适应_Adapt

已于 2024-09-25 02:17:51 修改

阅读量797

点赞数 5

文章标签： python pdf 开发语言

于 2023-07-10 01:01:32 首次发布

本文链接：https://blog.csdn.net/qq_51790672/article/details/131630165

版权

由于需要，结合大佬的 python 代码，做了一个程序，能够批量处理选定目录内的所有 pdf 文件：删除每个文件中多余的页面，仅保留第一页。

参考代码：(17条消息) python——获取文件夹中所有文件的路径_python 获取所有文件路径_哎呦不错的温jay的博客-CSDN博客

(17条消息) python删除pdf指定页的方法_欧气测不准的博客-CSDN博客

先安装需要的库（在cmd里）

pip install PyPDF2 -i https://pypi.tuna.tsinghua.edu.cn/simple/

python代码：

import os
import PyPDF2

def listdir(path, list_name):  # list_name: caller-supplied list that receives the results
    """Collect the paths of all non-directory entries directly inside ``path``.

    Each entry name is joined onto ``path`` and appended to ``list_name``.
    Sub-directories are skipped rather than descended into. The function
    returns nothing; the caller's list is extended in place.
    """
    # Build the full path of every entry lazily, then keep the non-directories.
    candidates = (os.path.join(path, name) for name in os.listdir(path))
    list_name.extend(entry for entry in candidates if not os.path.isdir(entry))
 


def delete_pdf_page(input_path, output_path, page_number):
    """Reduce a PDF to a single page.

    Despite the name, this keeps exactly one page and drops all the others.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Path the one-page PDF is written to. It may be the same
            file as ``input_path``.
        page_number: Zero-based index of the page to KEEP.

    Returns:
        None. The outcome is reported with ``print``; if ``page_number`` is
        out of range nothing is written.
    """
    import io  # local import: only needed for the in-memory output buffer

    with open(input_path, 'rb') as input_file:
        reader = PyPDF2.PdfReader(input_file)
        num_pages = len(reader.pages)  # total number of pages in the source

        if page_number < 0 or page_number >= num_pages:
            print(f"Invalid page number. The PDF file has {num_pages} pages.")
            return

        writer = PyPDF2.PdfWriter()
        # Index the wanted page directly instead of scanning every page.
        writer.add_page(reader.pages[page_number])

        # Serialize into memory while the source is still open. Opening
        # output_path with 'wb' at this point would truncate the source when
        # both paths are the same file, before the page data is safely out.
        buffer = io.BytesIO()
        writer.write(buffer)

    # The source is closed now, so overwriting it in place is safe.
    with open(output_path, 'wb') as output_file:
        output_file.write(buffer.getvalue())

    print(f"Kept only page {page_number + 1}; all other pages deleted successfully.")

# Entry point: trim every PDF directly inside the folder down to a single page.
list_name = []
path = 'D:/Mr.s/外文文献1'  # folder to process
listdir(path, list_name)  # collect the paths of all non-directory entries in `path`
page_number_to_keep = 0  # zero-based index of the page to keep

for current_path in list_name:
    # listdir() collects every file, so skip anything that is not a PDF
    # instead of handing it to the PDF reader (which would abort the batch).
    if not current_path.lower().endswith('.pdf'):
        continue
    # Same path for input and output: each file is overwritten in place.
    delete_pdf_page(current_path, current_path, page_number_to_keep)

需要注意：输入文件夹路径时，分隔符要使用正斜杠 /，而不是从 Windows 资源管理器中复制出来的反斜杠 \（反斜杠在 Python 普通字符串中会被当作转义符）。

后续完善：

后来处理某些文件夹时遇到了问题：部分待处理的 PDF 文件使用了 AES 加密，而当前环境中缺少处理 AES 加密所需的库 PyCryptodome，因此需要先在 cmd 中安装 PyCryptodome 库：

pip install pycryptodome

之后对代码进行完善如下：

import os
import PyPDF2
from PyPDF2.errors import PdfReadError

def listdir(path, list_name):
    """Append the path of each plain (non-directory) entry in ``path``.

    ``list_name`` is a list supplied by the caller and is filled in place;
    the function itself returns nothing. Directories found in ``path`` are
    ignored, so the listing is not recursive.
    """
    for entry_name in os.listdir(path):
        candidate = os.path.join(path, entry_name)  # full path of this entry
        if not os.path.isdir(candidate):
            list_name.append(candidate)

def delete_pdf_page(input_path, output_path, page_number):
    """Reduce a PDF to a single page, tolerating encrypted or unreadable files.

    Despite the name, this keeps exactly one page and drops all the others.
    Encrypted files are opened with an empty password. Problems are reported
    with ``print`` and the file is left untouched, so a batch run can carry
    on with the next file.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Path the one-page PDF is written to. It may be the same
            file as ``input_path``.
        page_number: Zero-based index of the page to KEEP.

    Returns:
        None.
    """
    import io  # local import: only needed for the in-memory output buffer

    try:
        with open(input_path, 'rb') as input_file:
            reader = PyPDF2.PdfReader(input_file)

            if reader.is_encrypted:
                try:
                    decrypted = reader.decrypt('')
                except Exception as e:
                    # Deliberately broad: a missing crypto backend is
                    # presumably reported with an error type other than
                    # PdfReadError (verify against the installed PyPDF2).
                    print(f"Failed to decrypt {input_path}: {e}")
                    return
                # decrypt() signals a rejected password through a falsy
                # return value rather than an exception, so check it.
                if not decrypted:
                    print(f"Failed to decrypt {input_path}: the empty password was rejected")
                    return

            num_pages = len(reader.pages)  # total number of pages in the source

            if page_number < 0 or page_number >= num_pages:
                print(f"Invalid page number. The PDF file has {num_pages} pages.")
                return

            writer = PyPDF2.PdfWriter()
            # Index the wanted page directly instead of scanning every page.
            writer.add_page(reader.pages[page_number])

            # Serialize into memory while the source is still open. Opening
            # output_path with 'wb' here would truncate the source when both
            # paths are the same file, before the page data is safely out.
            buffer = io.BytesIO()
            writer.write(buffer)

        # The source is closed now, so overwriting it in place is safe.
        with open(output_path, 'wb') as output_file:
            output_file.write(buffer.getvalue())

        print(f"Kept only page {page_number + 1}; all other pages deleted successfully.")

    except PdfReadError as e:
        print(f"Error reading {input_path}: {e}")

# Entry point: trim every PDF directly inside the folder down to a single page.
list_name = []
path = 'D:/Mr.s/需要仅保留第一页'  # folder to process
listdir(path, list_name)  # collect the paths of all non-directory entries in `path`
page_number_to_keep = 0  # zero-based index of the page to keep

for current_path in list_name:
    # listdir() collects every file; skip non-PDF files so they do not
    # produce a spurious read-error line for each of them.
    if not current_path.lower().endswith('.pdf'):
        continue
    # Same path for input and output: each file is overwritten in place.
    delete_pdf_page(current_path, current_path, page_number_to_keep)