提取pdf文字
pdfplumber.open(PDF路径)
pdf.pages[页码索引]（索引从0开始，第一页是pdf.pages[0]）
page.extract_text()（extract_text是页面对象page的方法，page由pdf.pages[...]取得）
#读取第一页
import pdfplumber
# Open the PDF inside a `with` block so it is cleaned up when the block exits.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    # pdf.pages is indexed from 0, so index 0 is the first page.
    first_page = pdf.pages[0]
    # Print the text extracted from that page.
    print(first_page.extract_text())
在sublime中运行可能会出现
UnicodeEncodeError: 'gbk' codec can't encode character '\u2022' in position 590: illegal multibyte sequence
原因是输出控制台使用gbk编码，无法输出'\u2022'这类字符；换用vscode运行就好了（也可以设置环境变量PYTHONIOENCODING=utf-8）
#读取全部页
import pdfplumber
# Open the PDF inside a `with` block so it is cleaned up when the block exits.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    # Walk the pages in order and print the text extracted from each one.
    for page in pdf.pages:
        print(page.extract_text())
分割pdf
PdfFileReader() , PdfFileWriter()
from PyPDF2 import PdfFileReader,PdfFileWriter