- 批量计算文件夹下所有pdf的页数,并生成一个excel文件,把pdf文件名字和页数都写入到该excel文件。
代码:
import os
import openpyxl
from PyPDF2 import PdfFileReader
def extract_number(filename: str):
    """Return the first run of decimal digits in *filename* as an int.

    Scanning stops at the first non-digit character that follows the run,
    so only the leading number is used ("a12b34" -> 12).

    Args:
        filename: Any string, typically a file's base name.

    Returns:
        The integer value of the first digit run, or ``None`` when the
        string contains no decimal digit.
    """
    digits = []
    for char in filename:
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as '²' that int() cannot parse, which would raise ValueError.
        if char.isdecimal():
            digits.append(char)
        elif digits:
            # The first digit run has ended.
            break
    return int("".join(digits)) if digits else None
# Folder holding the PDFs to inspect; the workbook is written into the same folder.
folder_path = r'/Users/Materials/资料/office相关/用python实现批量计算pdf页数, 批量重命名和批量pdf转图片/python图片转pdf'
excel_path = os.path.join(folder_path, 'pdf_info.xlsx')

# os.listdir() returns entries in arbitrary order, so sort the paths to make
# the row order of the generated sheet reproducible.
pdf_paths = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.pdf')
)

# Collect, per PDF, the file name without its extension and the page count.
pdf_names = []
page_counts = []
for pdf_path in pdf_paths:
    pdf_names.append(os.path.splitext(os.path.basename(pdf_path))[0])
    with open(pdf_path, 'rb') as f:
        page_counts.append(PdfFileReader(f).getNumPages())

workbook = openpyxl.Workbook()
worksheet = workbook.active
worksheet.title = '页面计算'
# Header row: column A = PDF name, column B = new file name, column C = page count.
worksheet['A1'] = 'PDF名字'
worksheet['B1'] = '新文件名字'
worksheet['C1'] = '页数'

# Data starts on row 2; column B is intentionally left empty here.
for row, (pdf_name, page_count) in enumerate(zip(pdf_names, page_counts), start=2):
    worksheet.cell(row=row, column=1, value=pdf_name)
    worksheet.cell(row=row, column=3, value=page_count)

workbook.save(excel_path)
- 根据excel中“新文件名字”一列,对pdf文件进行批量重命名(上一步生成的excel中该列为空,需要先手动填写)。
代码:
import os
import pandas as pd
# Read the Excel file; column 'PDF名字' holds the current base names and
# column '新文件名字' the desired new base names.
df = pd.read_excel('pdf_info.xlsx')

# Map "<current name>.pdf" -> new base name (without extension).
pdf_dict = {}
for pdf_name, num in zip(df['PDF名字'], df['新文件名字']):
    # A blank cell is read as NaN; str(NaN) would rename the file to
    # "nan.pdf", so rows without both names are skipped.
    if pd.isna(pdf_name) or pd.isna(num):
        continue
    # A numeric column that also contains blanks is read as float, so a
    # new name typed as 1 arrives as 1.0; turn it back into "1".
    if isinstance(num, float) and num.is_integer():
        num = int(num)
    pdf_dict[str(pdf_name) + '.pdf'] = str(num)

# Rename every PDF in the current folder that has an entry in the mapping.
for file in os.listdir('.'):
    if not file.endswith('.pdf') or file not in pdf_dict:
        continue
    target = pdf_dict[file] + '.pdf'
    if target == file:
        continue
    # os.rename() silently replaces an existing file on POSIX; never
    # overwrite another PDF.
    if os.path.exists(os.path.join('.', target)):
        print("skip: " + file + " -> " + target + " (target already exists)")
        continue
    os.rename(os.path.join('.', file), os.path.join('.', target))
- 批量把pdf文件转化成图片,并创建对应的文件夹,图片的命名规则是8位数,前面有0填充。
代码:
# 批量把pdf文件转化成jpg
# Python 3.7.13
# PyPDF2 2.12.1
import os
from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader
from PIL import Image
# Set the maximum number of pixels Pillow accepts for a single image.
Image.MAX_IMAGE_PIXELS = 2300000000

# Original author's note: an absolute path must be used, otherwise an error is raised.
folder_path = r'/Users/Materials/资料/office相关/用python实现批量计算pdf页数, 批量重命名和批量pdf转图片/python图片转pdf'

# Pages rendered per convert_from_path() call; exporting in batches avoids
# running into memory limits.
batch_size = 10

for filename in os.listdir(folder_path):
    if not filename.endswith(".pdf"):
        continue
    pdf_path = os.path.join(folder_path, filename)

    # Read the page count, closing the file handle as soon as it is known.
    with open(pdf_path, "rb") as pdf_file:
        max_pages = PdfFileReader(pdf_file).numPages

    # One output folder per PDF, named after the PDF without its extension.
    # exist_ok lets the script be re-run without failing on existing folders.
    new_folder_path = os.path.join(folder_path, os.path.splitext(filename)[0])
    os.makedirs(new_folder_path, exist_ok=True)

    # Always a string, so the summary line below also works for a PDF
    # that has no pages.
    current_page = "0".zfill(8)
    for page in range(1, max_pages + 1, batch_size):
        last_page = min(page + batch_size - 1, max_pages)
        images = convert_from_path(pdf_path, dpi=200, first_page=page, last_page=last_page)
        for i, image in enumerate(images):
            # Image names are the 1-based page number zero-padded to
            # 8 digits, e.g. 00000001.
            current_page = str(i + page).zfill(8)
            image.save(os.path.join(new_folder_path, current_page + ".jpg"), "JPEG")

    print(f"file_name:{filename}, max_pages:{max_pages}, all_pages:{current_page}")