相关模块
提取pdf文字
pdfplumber.open(PDF路径)
pdf.pages[页码索引](索引从0开始,pdf.pages[0]即第一页)
page.extract_text()(对某一页调用,返回该页的文字)
# Read the first page and print its text.
import pdfplumber

# The context manager closes the PDF once the block exits.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as document:
    # pages is indexed from 0, so index 0 is the first page.
    text = document.pages[0].extract_text()
    print(text)
sublime可能会出现
UnicodeEncodeError: 'gbk' codec can't encode character '\u2022' in position 590: illegal multibyte sequence
改用 VS Code 运行即可(报错原因是 Sublime 的输出控制台使用 gbk 编码,无法输出 '\u2022' 这类字符)
# Read every page and print its text, in page order.
import pdfplumber

with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as document:
    # Lazy generator: each page is extracted right before it is printed.
    page_texts = (each_page.extract_text() for each_page in document.pages)
    for text in page_texts:
        print(text)
分割pdf
PdfFileReader() , PdfFileWriter()
# Split a PDF into one single-page file per page.
import os

from PyPDF2 import PdfFileReader, PdfFileWriter

# makedirs(..., exist_ok=True) instead of mkdir so that re-running the script
# does not crash with FileExistsError when the output directory already exists.
os.makedirs('分割后的pdf文件', exist_ok=True)

pdf_reader = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
for page in range(pdf_reader.getNumPages()):
    # A fresh writer per iteration so each output file holds exactly one page.
    pdf_writer = PdfFileWriter()
    # getPage returns the content of one specific page (0-based index).
    pdf_writer.addPage(pdf_reader.getPage(page))
    with open(f'./分割后的pdf文件/Netease Q2 2019 Earnings {page}.pdf', 'wb') as out:
        pdf_writer.write(out)
合并pdf文件
# Merge the split single-page files back into one PDF, in file-number order.
from PyPDF2 import PdfFileReader, PdfFileWriter

# Number of split files to merge (files are numbered 0 .. SPLIT_FILE_COUNT - 1).
# Adjust this to match the number of files produced by the split step.
SPLIT_FILE_COUNT = 16

pdf_writer = PdfFileWriter()
for file_index in range(SPLIT_FILE_COUNT):
    pdf_reader = PdfFileReader(f'./分割后的pdf文件/Netease Q2 2019 Earnings {file_index}.pdf')
    # Distinct name for the inner index: the original reused `page` for both
    # loops, shadowing the outer loop variable.
    for page_index in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page_index))

# Write once, after every page has been collected.
with open('merged.pdf', 'wb') as out:
    pdf_writer.write(out)
旋转pdf
.rotateClockwise(90的倍数) 顺时针旋转
.rotateCounterClockwise(90的倍数) 逆时针旋转
# Rotate the first two pages and save them to a new PDF.
from PyPDF2 import PdfFileReader, PdfFileWriter

source = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
output = PdfFileWriter()

# First page: 90 degrees clockwise.
first_page = source.getPage(0)
output.addPage(first_page.rotateClockwise(90))

# Second page: 90 degrees counter-clockwise.
second_page = source.getPage(1)
output.addPage(second_page.rotateCounterClockwise(90))

with open('rotated.pdf', 'wb') as out:
    output.write(out)
排序pdf
直接按照期望的顺序添加页面即可,此处例子为倒序排列
# Reorder pages by adding them in the desired order; here, reversed.
from PyPDF2 import PdfFileReader, PdfFileWriter

source = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
output = PdfFileWriter()

# Walk the page indices from last to first.
page_count = source.getNumPages()
for index in reversed(range(page_count)):
    output.addPage(source.getPage(index))

with open('ordered.pdf', 'wb') as out:
    output.write(out)
pdf添加水印
首先制作水印文件
先拿word制作一个只有水印的页面,放在你想放水印的位置,保存为pdf
然后合并pdf
打开水印pdf文件和要加水印的pdf文件,再创建一个pdf写入器
然后合并pdf水印
对每一页都进行合并水印的操作,注意.mergePage()方法合成的页面顺序
下面的内容.mergePage(出现在上面的内容)
# Add a watermark to every page of a PDF.
from PyPDF2 import PdfFileReader, PdfFileWriter
from copy import copy

# mark.pdf is a one-page PDF that contains only the watermark.
watermark_page = PdfFileReader('mark.pdf').getPage(0)

source = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
output = PdfFileWriter()

for index in range(source.getNumPages()):
    # Start from a fresh copy of the watermark page for every output page, so
    # the shared watermark page object is not merged into repeatedly.
    stamped = copy(watermark_page)
    # bottom.mergePage(top): the watermark copy is the bottom layer and the
    # original page content is drawn on top of it.
    stamped.mergePage(source.getPage(index))
    output.addPage(stamped)

with open('watermark.pdf', 'wb') as out:
    output.write(out)
pdf加密
pdf_writer.encrypt(密码)
# Copy every page into a new PDF and protect it with a password.
from PyPDF2 import PdfFileReader, PdfFileWriter

source = PdfFileReader('Netease Q2 2019 Earnings Release-Final.pdf')
output = PdfFileWriter()

page_count = source.getNumPages()
for index in range(page_count):
    output.addPage(source.getPage(index))

# Set the password on the writer before the file is written out.
output.encrypt('hxx')

with open('encrypted.pdf', 'wb') as out:
    output.write(out)
解密pdf
pdf_reader.decrypt(密码)
# Decrypt a password-protected PDF and save an unprotected copy.
from PyPDF2 import PdfFileReader, PdfFileWriter

pdf_reader = PdfFileReader('encrypted.pdf')

# decrypt() returns 0 when the password matches neither the user nor the owner
# password. Fail here with a clear message instead of letting a later page
# read raise a less obvious error.
if pdf_reader.decrypt('hxx') == 0:
    raise ValueError("Wrong password: could not decrypt 'encrypted.pdf'")

pdf_writer = PdfFileWriter()
for page in range(pdf_reader.getNumPages()):
    pdf_writer.addPage(pdf_reader.getPage(page))

# The writer has no encrypt() call, so the output file is not password-protected.
with open('decrypted.pdf', 'wb') as out:
    pdf_writer.write(out)