一、首先安装依赖
pip install PyPDF2==3.0.1
二、测试文件checkImgOrTxt.py
import PyPDF2
def is_scanned_pdf(file_path):
    """Report whether a PDF appears to contain scanned (image-only) pages.

    The text layer of each page is extracted with PyPDF2. The PDF is
    classified as scanned as soon as one page yields nothing but whitespace.
    The page count is printed to stdout before the pages are inspected.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        True if at least one page has no extractable text, False otherwise
        (including a PDF that has zero pages).
    """
    # The context manager closes the file handle on every exit path,
    # including the early return below and any error raised while parsing.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        print(len(pdf_reader.pages))
        for page in pdf_reader.pages:
            # Defensive: treat a missing (None) extraction result as empty
            # text so that .strip() cannot raise.
            if not (page.extract_text() or '').strip():
                return True
    return False
# Usage example: classify one sample PDF and print the verdict.
pdf_file_path = 'd:\\444.pdf'
is_scanned = is_scanned_pdf(pdf_file_path)
# A single print with a conditional expression replaces the if/else pair.
print('该PDF文件为扫描图像' if is_scanned else '该PDF文件包含文本')
三、运行
python checkImgOrTxt.py