扫描件PDF基本上都是图片,无法直接复制其中的内容,那么怎样才能复制到内容呢?有人说很多软件都可以做到,但基本上所有的软件都需要购买VIP,有没有免费的呢?
有!我发现WPS里面“转换”——“扫描件识别”,可以转换PDF,而且识别准确性非常高,但是,每个pdf最多只能识别5页,所以,我们需要将需要扫描的原始pdf每隔5页分割成多个pdf,再进行转换,之后再进行合并,就OK了。
网上搜了一下,几乎所有的pdf拆分合并的软件都是要钱的,那么我们可以通过python,自己写一个程序,进行pdf的拆分和合并,完全免费,随意使用。
下面是pdf拆分的代码,桌面程序
import tkinter as tk
from tkinter import filedialog as fd
from PyPDF2 import PdfReader, PdfWriter
def pdf_page_num(path):
    """Return the total number of pages of the PDF stored at ``path``."""
    return len(PdfReader(path).pages)
# Open a file dialog and load the chosen PDF into the form
def select_pdf():
    """Let the user pick the PDF to split and publish it to the form.

    On a successful pick the path is written to ``pdf_in``, the page-range
    label is updated to ``1-<page count>`` and the "choose output folder"
    button is enabled.  Nothing changes when the dialog is cancelled.
    """
    pdf_selected = fd.askopenfilename(filetypes=[('PDF文件', '.pdf')])
    # A cancelled dialog normally yields ''; some Tk builds hand back an
    # empty tuple instead, which would slip past a ``!= ''`` comparison.
    if not pdf_selected:
        return
    # Read the page count before touching any widget state, so a file that
    # cannot be opened leaves the form exactly as it was.
    page_num = pdf_page_num(pdf_selected)
    pdf_in.set(pdf_selected)
    page_range.set("页码范围:1-%s" % page_num)
    button_out['state'] = "normal"
# Open a folder dialog to choose where the split files are saved
def select_out():
    """Let the user pick the output folder for the split files.

    On a successful pick the split button is enabled and the folder is
    written to ``pdf_out``.  Nothing changes when the dialog is cancelled.
    """
    path_save = fd.askdirectory()
    # A cancelled dialog normally yields ''; some Tk builds hand back an
    # empty tuple instead, which would slip past a ``!= ''`` comparison.
    if not path_save:
        return
    button_split['state'] = 'normal'
    pdf_out.set(path_save)
def _parse_page_ranges(page_text, total_pages):
    """Turn the user's page specification into 1-based inclusive ranges.

    Two forms are accepted:

    * ``/N`` -- cut the whole document into consecutive chunks of ``N``
      pages; the last chunk holds whatever pages remain.
    * ``a-b,c,...`` -- a comma-separated list of ranges and single pages.
      The full-width comma is accepted too, because the on-screen example
      is written with it.

    Returns a list of ``(first, last)`` tuples.

    Raises ``ValueError`` when the text is not numeric, the chunk size is
    not positive, no range is given, or a range is reversed or lies outside
    ``1..total_pages``.
    """
    text = page_text.strip().replace(",", ",")
    if text.startswith('/'):
        chunk = int(text.replace("/", ""))
        if chunk < 1:
            raise ValueError("chunk size must be at least 1")
        # Stepping by the chunk size also covers chunk == 1, where a
        # remainder test such as ``num % chunk == 1`` can never succeed.
        return [
            (first, min(first + chunk - 1, total_pages))
            for first in range(1, total_pages + 1, chunk)
        ]

    ranges = []
    for item in text.split(','):
        item = item.strip()
        if not item:
            # Tolerate a trailing or doubled comma.
            continue
        if "-" in item:
            first_text, last_text = item.split("-", 1)
            first, last = int(first_text), int(last_text)
        else:
            first = last = int(item)
        if not 1 <= first <= last <= total_pages:
            raise ValueError(
                "page range %s-%s is outside 1-%s" % (first, last, total_pages)
            )
        ranges.append((first, last))
    if not ranges:
        raise ValueError("no page range given")
    return ranges


def pdf_split():
    """Split the selected PDF into one output file per requested page range.

    Reads the source path, output folder and page specification from the
    form.  A reminder dialog is shown, and nothing is written, when any of
    them is missing or the page specification is invalid.  Each range is
    written to ``<first>-<last>.pdf`` (or ``<page>.pdf`` for a single page)
    inside the output folder, followed by a success dialog.
    """
    # Function-scope imports: ``import tkinter`` alone does not guarantee that
    # the ``messagebox`` submodule is loaded, and ``os`` is not imported by
    # this script.
    import os
    from tkinter import messagebox

    select_file_path = pdf_in.get()
    save_path = pdf_out.get()
    page_text = entry_page.get()
    if not select_file_path:
        messagebox.showinfo('提醒', "请选择需要分割的文件!!!")
        return
    if not save_path:
        messagebox.showinfo('提醒', "请选择保存的文件夹!!!")
        return
    if not page_text.strip():
        messagebox.showinfo('提醒', "请输入分割的页码!!!")
        return

    file_reader = PdfReader(select_file_path)
    # Validate every range up front so a bad entry cannot abort the run after
    # some of the output files have already been written.
    try:
        page_ranges = _parse_page_ranges(page_text, len(file_reader.pages))
    except ValueError:
        messagebox.showinfo('提醒', "页码输入有误,请检查后重新输入!!!")
        return

    for first, last in page_ranges:
        file_writer = PdfWriter()
        # Ranges are 1-based and inclusive; ``pages`` is indexed from 0.
        for page_index in range(first - 1, last):
            file_writer.add_page(file_reader.pages[page_index])
        name = first if first == last else "%s-%s" % (first, last)
        # os.path.join keeps the path valid on every platform instead of
        # hard-coding a backslash separator.
        with open(os.path.join(save_path, "%s.pdf" % name), 'wb') as out:
            file_writer.write(out)
    messagebox.showinfo('成功', "全部拆分成功!!!")
def select_split_file(root):
    """Build the split-tool form inside ``root``.

    Creates the Tk variables and widgets of the three-step form (source PDF,
    page specification, output folder) plus the split button, and positions
    them with absolute coordinates.  The variables and widgets the callbacks
    need are published as module globals.
    """
    global button_split, pdf_in, pdf_out, split_out, page_range, button_out, split_result, entry_page

    # Tk variables backing the entries and the page-range label.
    pdf_in = tk.StringVar()
    pdf_out = tk.StringVar()
    split_out = tk.StringVar()
    page_range = tk.StringVar()
    split_result = tk.StringVar()

    # Step 1: source PDF.
    label_input = tk.Label(root, text="1、选择要分割的PDF文件")
    entry_input = tk.Entry(root, textvariable=pdf_in, width=65)
    button_input = tk.Button(root, text="选择文件", command=select_pdf)

    # Step 2: page specification.
    label_page = tk.Label(root, text="2、设置分割的页码")
    label_page_des = tk.Label(
        root,
        text="(可分割为多个PDF,用逗号分隔。例如,1-5,2-10,11,12,如果需要间隔相同页码分割,比如每隔5页,/5)",
    )
    label_page_range = tk.Label(root, text="页码范围", textvariable=page_range)
    entry_page = tk.Entry(root, textvariable=split_out, width=65)

    # Step 3: output folder; the button stays disabled until a PDF is chosen.
    label_out = tk.Label(root, text='3、选择输出文件夹:')
    entry_out = tk.Entry(root, textvariable=pdf_out, width=65)
    button_out = tk.Button(root, text='选择保存位置', command=select_out, state='disabled')

    button_split = tk.Button(root, text='执行分割', command=pdf_split, width=20, height=2)

    # Absolute layout: (widget, x, y).
    layout = (
        (label_input, 10, 10),
        (entry_input, 10, 35),
        (button_input, 500, 32),
        (label_page, 10, 80),
        (label_page_des, 10, 105),
        (label_page_range, 10, 130),
        (entry_page, 10, 155),
        (label_out, 10, 205),
        (entry_out, 10, 230),
        (button_out, 500, 230),
        (button_split, 220, 260),
    )
    for widget, pos_x, pos_y in layout:
        widget.place(x=pos_x, y=pos_y)
# Entry point of the split tool: open a fixed-size 600x500 window centred on
# the screen, build the form and hand control to the Tk event loop.
root = tk.Tk()
root.title("PDF拆分")
offset_x = (root.winfo_screenwidth() - 600) / 2
offset_y = (root.winfo_screenheight() - 500) / 2
root.geometry('600x500+%d+%d' % (offset_x, offset_y))
root.resizable(width=False, height=False)
select_split_file(root)
root.mainloop()
下面是pdf合并的代码,同样是桌面程序
import tkinter as tk
from tkinter import filedialog as fd
from PyPDF2 import PdfReader, PdfWriter
import os
import natsort
# (label, pattern) pair in the form tkinter file dialogs take for ``filetypes``.
# NOTE(review): not referenced by any code visible in this section -- confirm
# whether it is still needed.
file_types = [('PDF文件', '.pdf')]
def pdf_reader(path):
    """Return the naturally sorted names of the PDF files directly in ``path``.

    Sub-directories and files without a ``.pdf`` extension (compared
    case-insensitively) are left out, so the list shown to the user contains
    only PDF documents.
    """
    names = [
        name
        for name in os.listdir(path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(path, name))
    ]
    # Natural sort keeps "2.pdf" ahead of "10.pdf".
    return natsort.natsorted(names)
def pdf_reader_path(path):
    """Return the full paths of the PDF files in ``path`` in natural order.

    Only regular files whose name ends in ``.pdf`` (compared
    case-insensitively) are returned.  Without that filter any stray file in
    the folder would be handed to the PDF reader and break the merge.
    """
    paths = []
    # Natural sort keeps "2.pdf" ahead of "10.pdf".
    for file_name in natsort.natsorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        if file_name.lower().endswith('.pdf') and os.path.isfile(file_path):
            print(file_name)
            paths.append(file_path)
    return paths
# Choose the folder that holds the PDFs to merge
def select_in():
    """Let the user pick the input folder and list its contents in the form.

    On a successful pick the "choose output folder" button is enabled, the
    folder is written to ``pdf_in`` and the text box shows the names returned
    by ``pdf_reader``.  Nothing changes when the dialog is cancelled.
    """
    path_in = fd.askdirectory()
    # A cancelled dialog normally yields ''; some Tk builds hand back an
    # empty tuple instead, which would slip past a ``!= ''`` comparison.
    if not path_in:
        return
    button_out['state'] = 'normal'
    pdf_in.set(path_in)
    pdfs = pdf_reader(path_in)
    # Replace the previous listing; inserting at END alone would glue the new
    # folder's names onto the old ones when a second folder is chosen.
    text_names.delete('1.0', tk.END)
    text_names.insert(tk.END, ",".join(pdfs))
# Choose the folder the merged PDF is saved to
def select_out():
    """Let the user pick the output folder and write it to ``pdf_out``.

    Nothing changes when the dialog is cancelled.
    """
    path_save = fd.askdirectory()
    # A cancelled dialog normally yields ''; some Tk builds hand back an
    # empty tuple instead, which would slip past a ``!= ''`` comparison.
    if path_save:
        pdf_out.set(path_save)
def pdf_split():
    """Merge the PDFs of the chosen input folder into a single output file.

    The name is kept because the merge button is bound to it.  The files
    returned by ``pdf_reader_path`` are appended page by page, in that
    order, and written to ``合成后文件.pdf`` in the output folder.  A
    reminder dialog is shown, and nothing is written, when a folder is
    missing or there is nothing to merge.
    """
    # ``import tkinter`` alone does not guarantee that the ``messagebox``
    # submodule is loaded, so import it explicitly.
    from tkinter import messagebox

    path_in = pdf_in.get()
    save_path = pdf_out.get()
    if not path_in:
        messagebox.showinfo('提醒', "请选择需要合并的文件所在文件夹!!!")
        return
    if not save_path:
        messagebox.showinfo('提醒', "请选择保存的文件夹!!!")
        return

    # os.path.join keeps the path valid on every platform instead of
    # hard-coding a backslash separator.
    out_path = os.path.join(save_path, "合成后文件.pdf")
    out_key = os.path.normcase(os.path.abspath(out_path))
    # When input and output folder coincide, a result left over from an
    # earlier run must not be merged into the new result.
    paths = [
        candidate
        for candidate in pdf_reader_path(path_in)
        if os.path.normcase(os.path.abspath(candidate)) != out_key
    ]
    if not paths:
        messagebox.showinfo('提醒', "所选文件夹中没有可合并的PDF文件!!!")
        return

    file_writer = PdfWriter()
    for file in paths:
        print(file)
        # Append every page of this source document to the output.
        file_reader = PdfReader(file)
        for page in file_reader.pages:
            file_writer.add_page(page)
    with open(out_path, 'wb') as out:
        file_writer.write(out)
    messagebox.showinfo('成功', "全部合成成功!!!")
def select_split_file(root):
    """Build the merge-tool form inside ``root``.

    Creates the Tk variables and widgets of the form (input folder, output
    folder, merge button and a text box with a scrollbar) and positions them
    with absolute coordinates.  The variables and widgets the callbacks need
    are published as module globals.
    """
    global button_split, pdf_in, pdf_out, pdf_names, button_out, split_result, entry_page, text_names

    # Tk variables backing the two path entries.
    pdf_in = tk.StringVar()
    pdf_out = tk.StringVar()
    pdf_names = tk.StringVar()
    split_result = tk.StringVar()

    # Step 1: folder holding the PDFs to merge.
    label_input = tk.Label(root, text='1、选择需要合并的PDF文件所在的文件夹:')
    entry_input = tk.Entry(root, textvariable=pdf_in, width=60)
    button_input = tk.Button(root, text='选择合并文件夹', command=select_in)

    # Step 2: output folder; the button stays disabled until an input folder
    # has been chosen.
    label_out = tk.Label(root, text='2、选择输出文件夹:')
    entry_out = tk.Entry(root, textvariable=pdf_out, width=60)
    button_out = tk.Button(root, text='选择保存位置', command=select_out, state='disabled')

    button_split = tk.Button(root, text='执行合并', command=pdf_split, width=20, height=2)

    # Text box listing the folder contents, wired to a vertical scrollbar.
    # NOTE(review): text_names is packed here and placed again below;
    # presumably the later place() call decides its final position -- confirm.
    text_names = tk.Text(root, height=20, width=79)
    text_names.pack(side='left', fill=tk.BOTH, expand=True)
    scrollbar = tk.Scrollbar(root, command=text_names.yview)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    text_names.config(yscrollcommand=scrollbar.set)

    # Absolute layout: (widget, x, y).
    layout = (
        (label_input, 10, 10),
        (entry_input, 10, 35),
        (button_input, 480, 32),
        (label_out, 10, 80),
        (entry_out, 10, 105),
        (button_out, 480, 105),
        (button_split, 220, 160),
        (text_names, 10, 230),
    )
    for widget, pos_x, pos_y in layout:
        widget.place(x=pos_x, y=pos_y)
# Entry point of the merge tool: open a fixed-size 600x500 window centred on
# the screen, build the form and hand control to the Tk event loop.
root = tk.Tk()
root.title("PDF合并")
offset_x = (root.winfo_screenwidth() - 600) / 2
offset_y = (root.winfo_screenheight() - 500) / 2
root.geometry('600x500+%d+%d' % (offset_x, offset_y))
root.resizable(width=False, height=False)
select_split_file(root)
root.mainloop()
其次,如果原始文件含有书签,可以用下面的控制台程序把原始文件的书签复制到新合并的pdf中。复制后的书签会定位到与原书签相同的目标位置,不会出现同一页有多个书签时点击后不跳转的情况。
import PyPDF2 as pdf
from PyPDF2.generic import Fit
def add_bookmark(writer, bookmark_list, realPage, parent):
    """Recreate an outline (bookmark) tree on ``writer``.

    Args:
        writer: object whose ``add_outline_item`` receives each bookmark.
        bookmark_list: outline entries; a nested list holds the children of
            the entry added just before it.
        realPage: mapping from a page's ``idnum`` to its page index.
        parent: outline item the entries of this level are attached to, or
            ``None`` for the top level.

    Each entry keeps its title and its ``/XYZ`` destination (left, top,
    zoom), and its title is printed as it is added.
    """
    # Start from ``parent`` so that a nested list appearing before any entry
    # of this level still has a defined parent instead of raising
    # UnboundLocalError.
    last_item = parent
    for bookmark in bookmark_list:
        if isinstance(bookmark, list):
            add_bookmark(writer, bookmark, realPage, last_item)
            continue
        title = bookmark.title
        # bookmark.page.idnum is an object id, not a page number; translate
        # it through the lookup table.
        pageIndex = realPage[bookmark.page.idnum]
        last_item = writer.add_outline_item(
            title,
            pageIndex,
            parent,
            fit=Fit("/XYZ", (bookmark.left, bookmark.top, bookmark.zoom)),
        )
        print(title)
if __name__ == "__main__":
    # Original document: the source of the bookmarks.
    reader = pdf.PdfReader(R"D:\xxx.pdf")
    # Bookmarks identify their page by idnum rather than by page number, so
    # build the idnum -> page index lookup first.
    realPage = {
        page.indirect_ref.idnum: index
        for index, page in enumerate(reader.pages)
    }

    # OCR-ed document: its pages are copied unchanged into the new file.
    reader2 = pdf.PdfReader(R"D:\xxx(OCR).pdf")
    writer2 = pdf.PdfWriter()
    for ocr_page in reader2.pages:
        writer2.add_page(ocr_page)

    # Rebuild the original outline on top of the copied pages and save.
    add_bookmark(writer2, reader.outline, realPage, None)
    writer2.write(R"D:\xxx(OCR-书签).pdf")
如果不想执行代码,也可以使用生成好的exe执行
链接:https://pan.baidu.com/s/15Ks2pTFmoIrXcZ-gpP9e2A
提取码:6dza