目录
相关库
- 粒度为文字则使用
pdfplumber
- 粒度为文件则使用
PyPDF2
安装
pip install pdfplumber
pip install "PyPDF2<3.0"  # 下文示例使用 PdfFileReader/PdfFileWriter 等旧版接口,PyPDF2 3.0 起已移除
pdfplumber
文字信息提取
- 利用pdfplumber打开一个 PDF 文件
import pdfplumber
with pdfplumber.open("./mongodb_base.pdf") as f:
- 获取指定的页,或者遍历每一页
print(f.pages)
-----------------------
[<Page:1>, <Page:2>, <Page:3>, <Page:4>, <Page:5>, <Page:6>]
- 利用.extract_text()方法提取当前页的文字
with pdfplumber.open("./mongodb_base.pdf") as pdf:
    # Take the first page and print the text extracted from it.
    first_page = pdf.pages[0]
    text = first_page.extract_text()
    print(text)
----------------------------------------------
MongoDB快速上手
表格信息提取
.extract_table()
提取指定页面的第一个表格
.extract_tables()
提取指定页面多个表格
PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
# PdfFileReader 读取器
# PdfFileWriter 写入器
读取器只能将读取的内容一页一页交给写入器
合并pdf
from PyPDF2 import PdfFileReader, PdfFileWriter

merged = PdfFileWriter()
# Walk through pdf1.pdf ... pdf5.pdf and append all of their pages, in order.
for i in range(1, 6):
    source = PdfFileReader(f"./pdf{i}.pdf")
    page_total = source.getNumPages()
    for page_no in range(page_total):
        merged.addPage(source.getPage(page_no))

# Save the combined document.
with open("./merge.pdf", 'wb') as out:
    merged.write(out)
拆分pdf
from PyPDF2 import PdfFileReader, PdfFileWriter

source = PdfFileReader("./test.pdf")
page_total = source.getNumPages()
for page in range(page_total):
    # A brand-new writer per iteration, so each output holds exactly one page.
    single = PdfFileWriter()
    single.addPage(source.getPage(page))
    # Emit the one-page PDF right away; the file name uses the zero-based
    # page index.
    with open(f"./pdf{page}.pdf", 'wb') as out:
        single.write(out)
加密pdf
- 将页添加到写入器
- pdf_writer.encrypt(密码)
- 写入pdf
旋转pdf
-
page.rotateClockwise(角度):顺时针旋转指定角度,角度必须是90的倍数
-
page.rotateCounterClockwise(角度):逆时针旋转指定角度,角度必须是90的倍数
from PyPDF2 import PdfFileReader, PdfFileWriter

source = PdfFileReader(r"xxx.pdf")
# Writer that collects the rotated pages.
rotated = PdfFileWriter()
for index in range(source.getNumPages()):
    current = source.getPage(index)
    # Odd indices turn clockwise, even indices counter-clockwise.
    if index % 2:
        turned = current.rotateClockwise(90)
    else:
        turned = current.rotateCounterClockwise(90)
    rotated.addPage(turned)

# Write the result.
with open("xxx2.pdf", "wb") as out:
    rotated.write(out)
倒序pdf
from PyPDF2 import PdfFileReader, PdfFileWriter

# Open the source PDF.
source = PdfFileReader(r"xxx.pdf")
reordered = PdfFileWriter()
# Visit the page indices from last to first.
for index in reversed(range(source.getNumPages())):
    reordered.addPage(source.getPage(index))

# Write the result.
with open("xxx2.pdf", "wb") as out:
    reordered.write(out)
加水印
加水印本质上就是把水印PDF页和需要加水印的页合并
- 准备水印pdf
- 使用自带的copy包复制水印:
from copy import copy

from PyPDF2 import PdfFileReader, PdfFileWriter

# Load the watermark; its first page is the watermark itself.
water_reader = PdfFileReader("./water.pdf")
water = water_reader.getPage(0)
# Load the PDF that should receive the watermark.
reader = PdfFileReader("./dst.pdf")
# Output document.
writer = PdfFileWriter()
for page in range(reader.getNumPages()):
    # Current page of the document being watermarked.
    p = reader.getPage(page)
    # mergePage() changes the page it is called on, so merge into a fresh
    # copy of the watermark each time to keep the original watermark clean.
    tmp = copy(water)
    # mergePage() works in place and returns None: the merged result is
    # `tmp` itself, so `tmp` (not the call's return value) goes to the writer.
    tmp.mergePage(p)
    writer.addPage(tmp)

# Write the watermarked PDF.
with open("加了水印.pdf", "wb") as out:
    writer.write(out)
- 需要加水印的PDF可能有很多页,而水印PDF只有一页;.mergePage 会原地修改调用它的那一页,如果直接拿水印页去合并,第一次合并之后它就不再是干净的水印页了,后面的页会叠加上前面页的内容。
- 因此不能直接拿水印页来合并,而要在每次循环中用 copy 复制出一份水印页副本(代码中的 tmp),再对副本调用 .mergePage 方法与当前页合并,把合并后的副本交给写入器,待最后统一输出。
- 进行pdf合并的时候,调用 .mergePage 的页在下层,作为参数传入的页叠加在上层;希望“水印”在下面、文字在上面,因此是“水印”.mergePage(“内容页”)
pdf添加密码
添加密码:
from PyPDF2 import PdfFileReader, PdfFileWriter

source = PdfFileReader(r"xxx.pdf")
protected = PdfFileWriter()
# Copy every page from the reader into the writer.
for index in range(source.getNumPages()):
    current = source.getPage(index)
    protected.addPage(current)

# Set the password on the writer.
protected.encrypt("123456")

# Save the file.
with open("result.pdf", "wb") as out:
    protected.write(out)
输入密码:
from PyPDF2 import PdfFileReader, PdfFileWriter

source = PdfFileReader(r"xxx.pdf")
# Unlock the document with its password before touching any pages.
source.decrypt("123456")

# Writer that receives the pages of the unlocked document.
unlocked = PdfFileWriter()
for index in range(source.getNumPages()):
    current = source.getPage(index)
    unlocked.addPage(current)

# Write out the decrypted PDF.
with open("xxx2.pdf", "wb") as out:
    unlocked.write(out)