Python:使用PyPDF2处理PDF文件
- 安装PyPDF2
pip install PyPDF2
- 官方文档:https://pypi.org/project/PyPDF2/
- 导入模块
import PyPDF2
1. 读取PDF中的内容
# author:mlnt
# createdate:2022/8/16
import PyPDF2 # 导入PyPDF2模块
# 官方文档:https://pypi.org/project/PyPDF2/
# 1. Open the PDF in binary mode. The context manager guarantees the handle
#    is closed once reading is finished, even if parsing raises.
with open(file='test.pdf', mode='rb') as pdf:
    # 2. Get the number of pages.
    #    PdfReader is the current name of the reader class (the legacy
    #    PdfFileReader / numPages / getPage / extractText names are deprecated).
    pdf_reader = PyPDF2.PdfReader(pdf)
    page_count = len(pdf_reader.pages)
    # The original printed the count twice (once per API spelling); the
    # output is kept identical.
    print(f'PDF页数为:{page_count}')
    print(f'PDF页数为:{page_count}')
    # 3. Read the content of every page.
    #    - reader.pages[n] yields the n-th page; pages are counted from 0.
    #    - extract_text() returns the text of that page as a string.
    for pageObj in pdf_reader.pages:
        page_content = pageObj.extract_text()  # extract the page text
        print(page_content)
test.pdf:
读取效果:
2. PDF简单加密与解密
# author:mlnt
# createdate:2022/8/16
import PyPDF2
from PyPDF2 import PdfReader, PdfWriter
def checkEncrypted(filename):
    """Print whether the PDF at ``filename`` is encrypted.

    :param filename: path of the PDF file to inspect
    :return: None; the result is reported on stdout
    """
    # Open in binary mode inside a context manager so the handle is always
    # released (the original never closed it).
    with open(file=filename, mode='rb') as pdfObj:
        pdfRd = PyPDF2.PdfReader(pdfObj)
        encrypted = pdfRd.is_encrypted
    if encrypted:
        print(f'{filename}属于加密文件')
    else:
        print(f'{filename}未加密')
def pdfEncrypt(filename, password='123456'):
    """Write a password-protected copy of ``filename``.

    Every page is rotated (see below) and added to a new document that is
    then encrypted and saved next to the input as ``<name>-encrypted.pdf``.

    :param filename: path of the source PDF (a string)
    :param password: password applied to the output; defaults to the value
        the original code hard-coded
    :return: None
    """
    reader = PdfReader(filename)
    writer = PdfWriter()
    # Copy every page into the writer.
    for page in reader.pages:
        # Page-rotation demo kept from the original: +90 followed by -180,
        # i.e. a net quarter turn counter-clockwise.
        page.rotate_clockwise(90)
        page.rotate_clockwise(-180)
        writer.add_page(page)
    # Protect the output with the password.
    writer.encrypt(password)
    # Strip the '.pdf' suffix only when it is actually present: rfind()
    # returns -1 otherwise, and slicing with -1 would drop the last character.
    cut = filename.rfind('.pdf')
    stem = filename if cut == -1 else filename[:cut]
    new_name = stem + '-encrypted.pdf'
    # Save the encrypted PDF.
    with open(new_name, 'wb') as f:
        writer.write(f)
def pdfDecrypt(filename, password='123456'):
    """Write an unencrypted copy of ``filename``.

    The output is saved next to the input as ``<name>-decrypted.pdf``.

    :param filename: path of the (possibly encrypted) source PDF (a string)
    :param password: password used to unlock the source; defaults to the
        value the original code hard-coded
    :return: None
    """
    reader = PdfReader(filename)
    writer = PdfWriter()
    # Only unlock when the document is actually encrypted.
    if reader.is_encrypted:
        reader.decrypt(password)
    for page in reader.pages:
        writer.add_page(page)
    # Strip the '.pdf' suffix only when it is actually present: rfind()
    # returns -1 otherwise, and slicing with -1 would drop the last character.
    cut = filename.rfind('.pdf')
    stem = filename if cut == -1 else filename[:cut]
    new_name = stem + '-decrypted.pdf'
    # Save the decrypted PDF.
    with open(new_name, 'wb') as f:
        writer.write(f)
# Demo run; the order matters because the last call consumes the file the
# second call produces.
checkEncrypted('test.pdf')  # report whether test.pdf is encrypted
pdfEncrypt('test.pdf')  # writes test-encrypted.pdf
pdfDecrypt('test-encrypted.pdf')  # writes test-encrypted-decrypted.pdf
3. 合并PDF文件
# author:mlnt
# createdate:2022/8/16
from PyPDF2 import PdfMerger
merger = PdfMerger()  # create the PdfMerger object
# Open both inputs in a context manager so the handles are closed afterwards
# (the original leaked them). They are kept open until the output is written,
# in case the merger still reads from them during write().
with open('test.pdf', 'rb') as pdf1, open('watermark.pdf', 'rb') as pdf2:
    try:
        # Append the whole of test.pdf to the merger.
        merger.append(fileobj=pdf1)
        # Insert watermark.pdf into the merger starting at page position 0.
        merger.merge(position=0, fileobj=pdf2)
        with open('merger-test.pdf', 'wb') as f:
            merger.write(f)
    finally:
        # Release the merger's resources even if merging or writing failed.
        merger.close()
watermark.pdf:
合并效果:
4. 处理PDF页面重叠
# author:mlnt
# createdate:2022/8/16
import PyPDF2
from PyPDF2 import PdfReader, PdfWriter
# Readers for the document to stamp and for the PDF holding the overlay.
reader1 = PdfReader('test.pdf')
reader2 = PdfReader('watermark.pdf')
writer = PdfWriter()  # collects the stamped pages

# The same overlay page (first page of watermark.pdf) is drawn onto every
# page of test.pdf, so look it up once.
overlay_page = reader2.pages[0]
for page in reader1.pages:
    page.merge_page(overlay_page)  # overlay-merge onto the current page
    writer.add_page(page)  # add the merged page to the new PDF

# Save the result.
with open('test-watermark.pdf', 'wb') as output_file:
    writer.write(output_file)
效果:
5. 添加水印到pdf
from pathlib import Path
from typing import Union, List
from PyPDF2 import PdfWriter, PdfReader
from typing_extensions import Literal
def watermark(
    content_pdf: Path,
    watermark_pdf: Path,
    pdf_result: Path,
    page_indices: Union[Literal["ALL"], List[int]] = "ALL",
):
    """
    Add a watermark to the selected pages of a PDF.

    :param content_pdf: PDF file to add the watermark to
    :param watermark_pdf: PDF whose first page is used as the watermark
    :param pdf_result: file name the processed PDF is saved under
    :param page_indices: indices of the pages to process, or "ALL" for
        every page
    :return: None
    """
    source = PdfReader(content_pdf)
    if page_indices == "ALL":
        page_indices = list(range(len(source.pages)))

    output = PdfWriter()
    for page_number in page_indices:
        original_page = source.pages[page_number]
        page_box = original_page.mediabox
        # Re-read the watermark PDF on every pass: merge_page() below
        # modifies the watermark page in place, so a fresh copy is needed
        # for each content page.
        stamped_page = PdfReader(watermark_pdf).pages[0]
        # Merge the content streams of the two pages into one.
        stamped_page.merge_page(original_page)
        # Carry over the content page's media box to the merged page.
        stamped_page.mediabox = page_box
        output.add_page(stamped_page)

    # Save the processed document.
    with open(pdf_result, "wb") as result_file:
        output.write(result_file)
# Watermark every page of test.pdf with watermark.pdf and save as result.pdf.
watermark(
    content_pdf='test.pdf',
    watermark_pdf='watermark.pdf',
    pdf_result='result.pdf',
)
效果: