目录
前言
因日常需要将PDF转化为图片并插入到文档中,于是我借助网上的AI工具实现了将PDF转化为图片的功能。
主要功能
1.提取嵌入PDF中的图片。
2.将PDF的每一页或指定页转化为图片。
可以同时拖入多个PDF文件到文件路径输入框中。
保存规则:默认先在PDF文件所在的文件夹中创建一个与PDF文件名相同的文件夹,输出图片保存到该文件夹中。
核心代码分析
一个多线程函数,防止在处理PDF时窗口卡死
def thread_it(func, *args):
    """Run ``func(*args)`` on a background daemon thread.

    Used for button callbacks so that long-running PDF work does not block
    the Tk event loop and freeze the window.

    :param func: callable to execute in the background
    :param args: positional arguments forwarded to ``func``
    """
    # The daemon flag has to be set on the thread itself before start().
    # Assigning ``t.setDaemon = True`` merely replaces the method with a bool
    # and leaves the thread non-daemonic, so it would keep the process alive
    # after the window is closed.
    t = threading.Thread(target=func, args=args, daemon=True)
    t.start()
# Both action buttons hand their handler to thread_it, so the handler runs on
# a background thread instead of inside the Tk event loop.
bt4 = tk.Button(frame4, text='提取pdf中的图片', command=lambda: thread_it(self.start))
bt4.grid(row=0, column=2, sticky='nsew')
bt5 = tk.Button(frame4, text='将pdf转换为图片', command=lambda: thread_it(self.pdf_to_image))
bt5.grid(row=0, column=3, sticky='nsew')
程序界面,需要传入一个主窗口
def __init__(self, master):
    """Build the user interface of the PDF image tool inside ``master``.

    :param master: Tk window (or container widget) that hosts every frame
    """
    self.master = master
    self.files = []

    # --- Row 0: source files and destination folder ---------------------
    path_frame = tk.Frame(master, relief="raised", borderwidth=1)
    tk.Label(path_frame, text='文件路径').grid(row=0, column=0, sticky="nsew")
    self.show_path = tk.Entry(path_frame)
    self.show_path.grid(row=0, column=1, sticky="nsew")
    # Drag-and-drop hook: dropped paths are passed to get_paths.
    windnd.hook_dropfiles(self.show_path, self.get_paths, force_unicode=True)
    tk.Button(path_frame, text='选择文件',
              command=self.choose_files).grid(row=0, column=2, sticky="nsew")
    tk.Label(path_frame, text='保存路径').grid(row=1, column=0, sticky="nsew")
    self.save_path = tk.Entry(path_frame)
    self.save_path.grid(row=1, column=1, sticky="nsew")
    tk.Button(path_frame, text='选择另存路径',
              command=self.choose_savepath).grid(row=1, column=2, sticky="nsew")

    # --- Row 1: page range and the two action buttons -------------------
    action_frame = tk.Frame(master, relief="raised", borderwidth=1)
    for column, weight in enumerate((0, 0, 1, 1)):
        action_frame.columnconfigure(column, weight=weight)
    action_frame.rowconfigure(0, weight=0)
    tk.Label(action_frame, text='提取范围').grid(row=0, column=0, sticky='nsew')
    self.page_input = tk.Entry(action_frame, relief="raised", borderwidth=2)
    self.page_input.grid(row=0, column=1, sticky='nsew')
    # Handlers go through thread_it so they run off the Tk event loop.
    tk.Button(action_frame, text='提取pdf中的图片',
              command=lambda: thread_it(self.start)).grid(row=0, column=2, sticky='nsew')
    tk.Button(action_frame, text='将pdf转换为图片',
              command=lambda: thread_it(self.pdf_to_image)).grid(row=0, column=3, sticky='nsew')

    # --- Row 2: export settings (resolution and zoom sliders) -----------
    options_frame = tk.Frame(master)
    for column, weight in enumerate((0, 1, 1)):
        options_frame.columnconfigure(column, weight=weight)
    options_frame.rowconfigure(0, weight=0)
    tk.Label(options_frame, text='导出图片配置:').grid(row=0, column=0, sticky='nsew')
    self.resolution_value = tk.IntVar()
    self.zoom_factor_value = tk.IntVar()
    resolution_scale = tk.Scale(options_frame, from_=200, to=800, resolution=200,
                                orient='horizontal', label='分辨率',
                                variable=self.resolution_value)
    resolution_scale.set(400)  # default resolution
    resolution_scale.grid(row=0, column=1, sticky='nsew')
    zoom_scale = tk.Scale(options_frame, from_=1, to=4, resolution=1,
                          orient='horizontal', label='缩放因子',
                          variable=self.zoom_factor_value)
    zoom_scale.set(2)  # default zoom factor
    zoom_scale.grid(row=0, column=2, sticky='nsew')

    # --- Row 3: scrolling log output -------------------------------------
    log_frame = tk.Frame(master, relief="raised", borderwidth=1)
    self.out = tk.scrolledtext.ScrolledText(log_frame)
    self.out.pack(fill=tk.BOTH, expand=True)

    path_frame.rowconfigure(0, weight=0)
    path_frame.rowconfigure(1, weight=0)
    path_frame.columnconfigure(1, weight=1)
    log_frame.rowconfigure(0, weight=1)
    log_frame.columnconfigure(0, weight=1)

    # Stack the four frames vertically; only the log row has a non-zero weight.
    for row, frame in enumerate((path_frame, action_frame, options_frame, log_frame)):
        frame.grid(row=row, column=0, sticky='nsew')
    master.columnconfigure(0, weight=1)
    for row, weight in enumerate((0, 0, 0, 1)):
        master.rowconfigure(row, weight=weight)

    # Gray placeholder texts and the focus handlers that manage them.
    self.save_path.insert(0, "默认保存在原文件夹")
    self.save_path.config(fg='gray')
    self.save_path.bind('<FocusIn>', self.on_entry_click)
    self.page_input.insert(0, "默认提取全部")
    self.page_input.config(fg='gray')
    self.page_input.bind('<FocusIn>', self.page_input_on_entry_click)
    self.page_input.bind('<FocusOut>', self.page_input_on_focus_out)
这是将页面数据转化为图片的核心函数
def to_be_img(self, pdf_page_number, input_pdf_page, img_path, resolution, zoom_factor):
    """Render one PDF page to a PNG file and report it in the log widget.

    :param pdf_page_number: page number shown in the progress message
    :param input_pdf_page: page object to render (its ``get_pixmap`` is called)
    :param img_path: destination path of the PNG file
    :param resolution: DPI value; also scales the bitmap by ``resolution / 72``
    :param zoom_factor: scale applied on both axes when rasterising the page
    :return: None
    """
    # Rasterise the page with the zoom matrix.
    pixmap = input_pdf_page.get_pixmap(matrix=fitz.Matrix(zoom_factor, zoom_factor))
    rendered = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
    # Rescale the bitmap by resolution / 72.
    target_size = (int(pixmap.width * resolution / 72), int(pixmap.height * resolution / 72))
    scaled = rendered.resize(target_size, resample=Image.LANCZOS)
    # Save as PNG and record the DPI in the file.
    scaled.save(img_path, "PNG", dpi=(resolution, resolution))
    self.out.see(tk.END)
    self.out.insert('insert', f'已处理第{pdf_page_number}页。\n')
有三种选择:提取全部;提取某一页;提取某一范围
def extract_images_from_pdf(self, pdf_path, output_folder, pages):
    """Extract the images embedded in a PDF and save them as PNG files.

    ``pages`` selects what is processed:

    * ``{'start': _, 'end': None}`` - every page of the document;
    * ``{'start': n, 'end': 0}`` - only page ``n`` (1-based);
    * ``{'start': a, 'end': b}`` - pages ``a`` to ``b`` inclusive (1-based).

    Each image is scaled by the zoom slider value and written to
    ``output_folder`` as ``page<P>_image<I>.png``. Failures on individual
    images are logged and counted; they do not stop the extraction.

    :param pdf_path: path of the PDF file to read
    :param output_folder: directory that receives the PNG files
    :param pages: page selection dict as described above
    :return: None
    """
    resolution = self.resolution_value.get()
    zoom_factor = self.zoom_factor_value.get()

    def log(message):
        # Append to the log widget and keep the newest line visible.
        self.out.see(tk.END)
        self.out.insert('insert', message)
        self.out.see(tk.END)

    doc = fitz.open(pdf_path)
    try:
        page_count = doc.page_count
        if pages['end'] is None:
            first, last = 1, page_count
        else:
            first = pages['start']
            last = first if pages['end'] == 0 else pages['end']
            # Reject page numbers outside the document up front: page 0 would
            # turn into index -1, and a page past the end would raise in the
            # middle of the run.
            if not 1 <= first <= page_count:
                log(f'页码超出范围:该PDF共{page_count}页\n')
                return
            if last > page_count:
                log(f'页码超出范围,仅处理到第{page_count}页\n')
                last = page_count

        for page_no in range(first, last + 1):
            page = doc.load_page(page_no - 1)
            image_list = page.get_images(full=True)
            error_nb = 0
            for img_index, img_info in enumerate(image_list, start=1):
                try:
                    # The first field of each entry is the image xref.
                    base_image = doc.extract_image(img_info[0])
                    image = Image.open(io.BytesIO(base_image["image"]))
                    image = image.resize(
                        (int(image.width * zoom_factor), int(image.height * zoom_factor)),
                        resample=Image.LANCZOS)
                    image.save(
                        os.path.join(output_folder, f"page{page_no}_image{img_index}.png"),
                        format="PNG", dpi=(resolution, resolution), quality=95)
                except Exception as e:
                    # Best effort: one broken image must not abort the page.
                    error_nb += 1
                    log(f'出错:{e}\n')
            log(f'第{page_no}页提取提取了{len(image_list) - error_nb}张,出错{error_nb}张\n')
    finally:
        # Always release the document, even when a page fails to load.
        doc.close()
    self.out.see(tk.END)
提取界面中相关参数并开始运行的函数
def start(self):
    """Extract the embedded images of every selected PDF (button handler).

    Reads the page range from the page-range entry ("n" for one page,
    "a-b" for a range, placeholder/empty for all pages), resolves the output
    folder for each PDF (custom save path or the PDF's own folder, plus a
    sub-folder named after the PDF) and calls ``extract_images_from_pdf``.

    :return: None
    """
    invalid_msg = '提取页码输入不给规范,请检查,样例:2-5(提取第2页到第5页的页面)'
    # 'end' is None for "all pages" and 0 for "only page 'start'".
    pages = {'start': 1, 'end': None}
    text = self.page_input.get()
    if text not in ('默认提取全部', '输入提取页码,如5-12', ''):
        parts = text.split('-')
        try:
            if len(parts) == 1:
                pages = {'start': int(parts[0]), 'end': 0}
            elif len(parts) == 2:
                pages = {'start': int(parts[0]), 'end': int(parts[1])}
                # Also catches "3-0", which would otherwise be mistaken for
                # the single-page marker end == 0.
                if pages['start'] > pages['end']:
                    raise ValueError('start page is after end page')
            else:
                raise ValueError('too many "-" separators')
            if pages['start'] < 1:
                raise ValueError('page numbers start at 1')
        except ValueError:
            messagebox.showerror(message=invalid_msg)
            return

    for pdf_path in self.files:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        custom_dir = self.save_path.get()
        if custom_dir != "默认保存在原文件夹" and custom_dir != "":
            output_folder = os.path.join(custom_dir, pdf_stem)
        else:
            output_folder = os.path.join(os.path.dirname(pdf_path), pdf_stem)
        # makedirs also creates a missing custom save directory.
        os.makedirs(output_folder, exist_ok=True)
        self.out.see(tk.END)
        self.out.insert('insert', f'正在处理{pdf_path}\n')
        self.out.insert('insert', f'图片保存路径为{output_folder}\n')
        try:
            self.extract_images_from_pdf(pdf_path=pdf_path, output_folder=output_folder, pages=pages)
        except Exception as e:
            # This runs on a worker thread: report the failure in the log and
            # continue with the remaining files instead of dying silently.
            self.out.insert('insert', f'处理{pdf_path}失败:{e}\n')
            self.out.see(tk.END)
完整代码
from tkinter import scrolledtext
import tkinter as tk
from tkinter import filedialog
import os
import fitz
import io
import concurrent.futures
import windnd
from tkinter import messagebox
from PIL import Image
import threading
def thread_it(func, *args):
    """Run ``func(*args)`` on a background daemon thread.

    Used for button callbacks so that long-running PDF work does not block
    the Tk event loop and freeze the window.

    :param func: callable to execute in the background
    :param args: positional arguments forwarded to ``func``
    """
    # The daemon flag has to be set on the thread itself before start().
    # Assigning ``t.setDaemon = True`` merely replaces the method with a bool
    # and leaves the thread non-daemonic, so it would keep the process alive
    # after the window is closed.
    t = threading.Thread(target=func, args=args, daemon=True)
    t.start()
class get_pdf_photo():
def __init__(self, master):
    """Build the user interface of the PDF image tool inside ``master``.

    :param master: Tk window (or container widget) that hosts every frame
    """
    self.master = master
    self.files = []

    # --- Row 0: source files and destination folder ---------------------
    path_frame = tk.Frame(master, relief="raised", borderwidth=1)
    tk.Label(path_frame, text='文件路径').grid(row=0, column=0, sticky="nsew")
    self.show_path = tk.Entry(path_frame)
    self.show_path.grid(row=0, column=1, sticky="nsew")
    # Drag-and-drop hook: dropped paths are passed to get_paths.
    windnd.hook_dropfiles(self.show_path, self.get_paths, force_unicode=True)
    tk.Button(path_frame, text='选择文件',
              command=self.choose_files).grid(row=0, column=2, sticky="nsew")
    tk.Label(path_frame, text='保存路径').grid(row=1, column=0, sticky="nsew")
    self.save_path = tk.Entry(path_frame)
    self.save_path.grid(row=1, column=1, sticky="nsew")
    tk.Button(path_frame, text='选择另存路径',
              command=self.choose_savepath).grid(row=1, column=2, sticky="nsew")

    # --- Row 1: page range and the two action buttons -------------------
    action_frame = tk.Frame(master, relief="raised", borderwidth=1)
    for column, weight in enumerate((0, 0, 1, 1)):
        action_frame.columnconfigure(column, weight=weight)
    action_frame.rowconfigure(0, weight=0)
    tk.Label(action_frame, text='提取范围').grid(row=0, column=0, sticky='nsew')
    self.page_input = tk.Entry(action_frame, relief="raised", borderwidth=2)
    self.page_input.grid(row=0, column=1, sticky='nsew')
    # Handlers go through thread_it so they run off the Tk event loop.
    tk.Button(action_frame, text='提取pdf中的图片',
              command=lambda: thread_it(self.start)).grid(row=0, column=2, sticky='nsew')
    tk.Button(action_frame, text='将pdf转换为图片',
              command=lambda: thread_it(self.pdf_to_image)).grid(row=0, column=3, sticky='nsew')

    # --- Row 2: export settings (resolution and zoom sliders) -----------
    options_frame = tk.Frame(master)
    for column, weight in enumerate((0, 1, 1)):
        options_frame.columnconfigure(column, weight=weight)
    options_frame.rowconfigure(0, weight=0)
    tk.Label(options_frame, text='导出图片配置:').grid(row=0, column=0, sticky='nsew')
    self.resolution_value = tk.IntVar()
    self.zoom_factor_value = tk.IntVar()
    resolution_scale = tk.Scale(options_frame, from_=200, to=800, resolution=200,
                                orient='horizontal', label='分辨率',
                                variable=self.resolution_value)
    resolution_scale.set(400)  # default resolution
    resolution_scale.grid(row=0, column=1, sticky='nsew')
    zoom_scale = tk.Scale(options_frame, from_=1, to=4, resolution=1,
                          orient='horizontal', label='缩放因子',
                          variable=self.zoom_factor_value)
    zoom_scale.set(2)  # default zoom factor
    zoom_scale.grid(row=0, column=2, sticky='nsew')

    # --- Row 3: scrolling log output -------------------------------------
    log_frame = tk.Frame(master, relief="raised", borderwidth=1)
    self.out = tk.scrolledtext.ScrolledText(log_frame)
    self.out.pack(fill=tk.BOTH, expand=True)

    path_frame.rowconfigure(0, weight=0)
    path_frame.rowconfigure(1, weight=0)
    path_frame.columnconfigure(1, weight=1)
    log_frame.rowconfigure(0, weight=1)
    log_frame.columnconfigure(0, weight=1)

    # Stack the four frames vertically; only the log row has a non-zero weight.
    for row, frame in enumerate((path_frame, action_frame, options_frame, log_frame)):
        frame.grid(row=row, column=0, sticky='nsew')
    master.columnconfigure(0, weight=1)
    for row, weight in enumerate((0, 0, 0, 1)):
        master.rowconfigure(row, weight=weight)

    # Gray placeholder texts and the focus handlers that manage them.
    self.save_path.insert(0, "默认保存在原文件夹")
    self.save_path.config(fg='gray')
    self.save_path.bind('<FocusIn>', self.on_entry_click)
    self.page_input.insert(0, "默认提取全部")
    self.page_input.config(fg='gray')
    self.page_input.bind('<FocusIn>', self.page_input_on_entry_click)
    self.page_input.bind('<FocusOut>', self.page_input_on_focus_out)
def page_input_on_focus_out(self, event):
    """Restore the "extract all" placeholder when the page-range entry loses focus.

    Only acts when the entry is empty or still shows the input hint.

    :param event: Tk focus event (unused)
    """
    current = self.page_input.get()
    if current and current != '输入提取页码,如5-12':
        return  # the user typed a real value; leave it alone
    self.page_input.delete(0, tk.END)
    self.page_input.insert(0, "默认提取全部")
    self.page_input.config(fg='gray')  # placeholder text is shown in gray
def get_paths(self, paths):
    """Handle files dropped onto the path entry.

    Replaces the current selection: PDF paths are stored in ``self.files``
    and shown in the entry; for any other file an info dialog is displayed.

    :param paths: iterable of dropped file paths (expected to be ``str``)
    :return: None
    """
    self.files = []
    self.show_path.delete(0, tk.END)  # drop the previous content
    self.show_path.config(fg='black')
    for path in paths:
        # Compare case-insensitively so files named "*.PDF" are accepted too.
        if path.lower().endswith('.pdf'):
            self.files.append(path)
            self.show_path.insert(0, path + '\n')
        else:
            messagebox.showinfo(title='Error', message=f'文件{path}不是PDF文件!')
def page_input_on_key(self, event):
    """Clear the hint text on a key press and switch to normal text colour.

    :param event: Tk key event (unused)
    """
    entry = self.page_input
    entry.delete(0, tk.END)   # remove the hint text
    entry.config(fg='black')  # typed text is shown in black
    # One-shot handler: detach it so later key presses edit the entry normally.
    entry.unbind("<Key>")
def page_input_on_entry_click(self, event):
    """Show the page-range input hint when the entry gains focus.

    Only acts while the entry still holds one of the two placeholder texts.

    :param event: Tk focus event (unused)
    """
    if self.page_input.get() not in ('输入提取页码,如5-12', '默认提取全部'):
        return
    entry = self.page_input
    entry.delete(0, tk.END)   # remove the current placeholder
    entry.config(fg='gray')   # the hint stays gray until the user types
    entry.insert(0, '输入提取页码,如5-12')
    entry.icursor(0)          # put the cursor in front of the hint
    # The first key press clears the hint (see page_input_on_key).
    entry.bind("<Key>", self.page_input_on_key)
def on_entry_click(self, event):
    """Clear the save-path placeholder when the entry gains focus.

    :param event: Tk focus event (unused)
    """
    if self.save_path.get() != "默认保存在原文件夹":
        return  # the entry already holds a user-supplied path
    self.save_path.delete(0, tk.END)
    self.save_path.config(fg='black')  # typed text is shown in black
def choose_savepath(self):
    """Ask for an output directory and show it in the save-path entry.

    Leaves the entry untouched when the dialog is cancelled.
    """
    path = filedialog.askdirectory()
    # A cancelled dialog yields an empty value; testing truthiness (rather
    # than ``!= ''``) also covers an empty tuple, which would otherwise be
    # inserted into the entry.
    if not path:
        return
    self.save_path.delete(0, tk.END)
    self.save_path.insert('insert', path)
    self.save_path.config(fg='black')  # a real path is shown in black
def choose_files(self):
    """Let the user pick one or more PDF files through a file dialog.

    The selection replaces ``self.files`` and is shown in the path entry.
    Cancelling the dialog results in an empty selection.
    """
    # Attach the dialog to the existing window. Creating a fresh hidden
    # tk.Tk() root on every call (and never destroying it) leaves one extra
    # Tk root alive per click.
    selected = filedialog.askopenfilenames(parent=self.master,
                                           filetypes=(('PDF Files', '*.pdf'),))
    # Normalise to a list so self.files has the same type as after a drop.
    self.files = list(selected)
    self.show_path.delete(0, tk.END)
    for pdf_path in self.files:
        self.show_path.insert('insert', pdf_path)
def to_be_img(self, pdf_page_number, input_pdf_page, img_path, resolution, zoom_factor):
    """Render one PDF page to a PNG file and report it in the log widget.

    :param pdf_page_number: page number shown in the progress message
    :param input_pdf_page: page object to render (its ``get_pixmap`` is called)
    :param img_path: destination path of the PNG file
    :param resolution: DPI value; also scales the bitmap by ``resolution / 72``
    :param zoom_factor: scale applied on both axes when rasterising the page
    :return: None
    """
    # Rasterise the page with the zoom matrix.
    pixmap = input_pdf_page.get_pixmap(matrix=fitz.Matrix(zoom_factor, zoom_factor))
    rendered = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
    # Rescale the bitmap by resolution / 72.
    target_size = (int(pixmap.width * resolution / 72), int(pixmap.height * resolution / 72))
    scaled = rendered.resize(target_size, resample=Image.LANCZOS)
    # Save as PNG and record the DPI in the file.
    scaled.save(img_path, "PNG", dpi=(resolution, resolution))
    self.out.see(tk.END)
    self.out.insert('insert', f'已处理第{pdf_page_number}页。\n')
def extract_images_from_pdf(self, pdf_path, output_folder, pages):
    """Extract the images embedded in a PDF and save them as PNG files.

    ``pages`` selects what is processed:

    * ``{'start': _, 'end': None}`` - every page of the document;
    * ``{'start': n, 'end': 0}`` - only page ``n`` (1-based);
    * ``{'start': a, 'end': b}`` - pages ``a`` to ``b`` inclusive (1-based).

    Each image is scaled by the zoom slider value and written to
    ``output_folder`` as ``page<P>_image<I>.png``. Failures on individual
    images are logged and counted; they do not stop the extraction.

    :param pdf_path: path of the PDF file to read
    :param output_folder: directory that receives the PNG files
    :param pages: page selection dict as described above
    :return: None
    """
    resolution = self.resolution_value.get()
    zoom_factor = self.zoom_factor_value.get()

    def log(message):
        # Append to the log widget and keep the newest line visible.
        self.out.see(tk.END)
        self.out.insert('insert', message)
        self.out.see(tk.END)

    doc = fitz.open(pdf_path)
    try:
        page_count = doc.page_count
        if pages['end'] is None:
            first, last = 1, page_count
        else:
            first = pages['start']
            last = first if pages['end'] == 0 else pages['end']
            # Reject page numbers outside the document up front: page 0 would
            # turn into index -1, and a page past the end would raise in the
            # middle of the run.
            if not 1 <= first <= page_count:
                log(f'页码超出范围:该PDF共{page_count}页\n')
                return
            if last > page_count:
                log(f'页码超出范围,仅处理到第{page_count}页\n')
                last = page_count

        for page_no in range(first, last + 1):
            page = doc.load_page(page_no - 1)
            image_list = page.get_images(full=True)
            error_nb = 0
            for img_index, img_info in enumerate(image_list, start=1):
                try:
                    # The first field of each entry is the image xref.
                    base_image = doc.extract_image(img_info[0])
                    image = Image.open(io.BytesIO(base_image["image"]))
                    image = image.resize(
                        (int(image.width * zoom_factor), int(image.height * zoom_factor)),
                        resample=Image.LANCZOS)
                    image.save(
                        os.path.join(output_folder, f"page{page_no}_image{img_index}.png"),
                        format="PNG", dpi=(resolution, resolution), quality=95)
                except Exception as e:
                    # Best effort: one broken image must not abort the page.
                    error_nb += 1
                    log(f'出错:{e}\n')
            log(f'第{page_no}页提取提取了{len(image_list) - error_nb}张,出错{error_nb}张\n')
    finally:
        # Always release the document, even when a page fails to load.
        doc.close()
    self.out.see(tk.END)
def start(self):
    """Extract the embedded images of every selected PDF (button handler).

    Reads the page range from the page-range entry ("n" for one page,
    "a-b" for a range, placeholder/empty for all pages), resolves the output
    folder for each PDF (custom save path or the PDF's own folder, plus a
    sub-folder named after the PDF) and calls ``extract_images_from_pdf``.

    :return: None
    """
    invalid_msg = '提取页码输入不给规范,请检查,样例:2-5(提取第2页到第5页的页面)'
    # 'end' is None for "all pages" and 0 for "only page 'start'".
    pages = {'start': 1, 'end': None}
    text = self.page_input.get()
    if text not in ('默认提取全部', '输入提取页码,如5-12', ''):
        parts = text.split('-')
        try:
            if len(parts) == 1:
                pages = {'start': int(parts[0]), 'end': 0}
            elif len(parts) == 2:
                pages = {'start': int(parts[0]), 'end': int(parts[1])}
                # Also catches "3-0", which would otherwise be mistaken for
                # the single-page marker end == 0.
                if pages['start'] > pages['end']:
                    raise ValueError('start page is after end page')
            else:
                raise ValueError('too many "-" separators')
            if pages['start'] < 1:
                raise ValueError('page numbers start at 1')
        except ValueError:
            messagebox.showerror(message=invalid_msg)
            return

    for pdf_path in self.files:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        custom_dir = self.save_path.get()
        if custom_dir != "默认保存在原文件夹" and custom_dir != "":
            output_folder = os.path.join(custom_dir, pdf_stem)
        else:
            output_folder = os.path.join(os.path.dirname(pdf_path), pdf_stem)
        # makedirs also creates a missing custom save directory.
        os.makedirs(output_folder, exist_ok=True)
        self.out.see(tk.END)
        self.out.insert('insert', f'正在处理{pdf_path}\n')
        self.out.insert('insert', f'图片保存路径为{output_folder}\n')
        try:
            self.extract_images_from_pdf(pdf_path=pdf_path, output_folder=output_folder, pages=pages)
        except Exception as e:
            # This runs on a worker thread: report the failure in the log and
            # continue with the remaining files instead of dying silently.
            self.out.insert('insert', f'处理{pdf_path}失败:{e}\n')
            self.out.see(tk.END)
def pdf_to_image(self):
    """Render pages of every selected PDF to PNG files (button handler).

    Reads the page range from the page-range entry ("n" for one page,
    "a-b" for a range, placeholder/empty for all pages) and the resolution
    and zoom values from the sliders. For each PDF the images are written as
    ``<page number>.png`` into a folder named after the PDF, located either
    in the custom save path or next to the PDF.

    :return: None
    """
    invalid_msg = '提取页码输入不给规范,请检查,样例:2-5(提取第2页到第5页的页面)'
    # 'end' is None for "all pages" and 0 for "only page 'start'".
    pages = {'start': 1, 'end': None}
    text = self.page_input.get()
    if text not in ('默认提取全部', '输入提取页码,如5-12', ''):
        parts = text.split('-')
        try:
            if len(parts) == 1:
                pages = {'start': int(parts[0]), 'end': 0}
            elif len(parts) == 2:
                pages = {'start': int(parts[0]), 'end': int(parts[1])}
                if pages['start'] > pages['end']:
                    raise ValueError('start page is after end page')
            else:
                raise ValueError('too many "-" separators')
            if pages['start'] < 1:
                raise ValueError('page numbers start at 1')
        except ValueError:
            messagebox.showerror(message=invalid_msg)
            return

    def log(message):
        # Append to the log widget and keep the newest line visible.
        self.out.see(tk.END)
        self.out.insert('insert', message)
        self.out.see(tk.END)

    resolution = self.resolution_value.get()
    zoom_factor = self.zoom_factor_value.get()

    for pdf_path in self.files:
        log(f'开始转换{pdf_path}\n')
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        custom_dir = self.save_path.get()
        if custom_dir and custom_dir != '默认保存在原文件夹':
            save_path = os.path.join(custom_dir, pdf_name)
        else:
            # Default: a folder next to the PDF, named like the PDF.
            save_path = os.path.splitext(pdf_path)[0]
        os.makedirs(save_path, exist_ok=True)

        try:
            mypdf = fitz.open(pdf_path)
        except Exception as e:
            log(f'无法打开{pdf_path}:{e}\n')
            continue
        try:
            page_count = mypdf.page_count
            if pages['end'] is None:
                first, last = 1, page_count
            else:
                first = pages['start']
                last = first if pages['end'] == 0 else pages['end']
                if not 1 <= first <= page_count:
                    log(f'页码超出范围:该PDF共{page_count}页\n')
                    continue
                if last > page_count:
                    log(f'页码超出范围,仅处理到第{page_count}页\n')
                    last = page_count

            # Pages are rendered one after another on this worker thread:
            # PyMuPDF does not support using a document from several threads
            # at once, so fanning the pages out to a thread pool is unsafe.
            for page_no in range(first, last + 1):
                img_path = os.path.join(save_path, f'{page_no}.png')
                try:
                    self.to_be_img(page_no, mypdf.load_page(page_no - 1), img_path,
                                   resolution, zoom_factor)
                except Exception as e:
                    # Report the failing page and keep converting the rest.
                    log(f'第{page_no}页转换失败:{e}\n')
        finally:
            # Close only after every page of this document has been handled.
            mypdf.close()
    log(f'转换任务完成\n\n')
# Script entry: create the root window, build the tool's UI inside it and
# run the Tk event loop until the window is closed.
root = tk.Tk()
root.title('提取PDF中的图片')
get_pdf_photo(root)
root.mainloop()
总结
我想要的功能已实现,日常够用了。