# python读pdf/双层pdf (Python: read text from a PDF / dual-layer OCR'd PDF)
import pyocr
import importlib
import sys
import time
# Re-import the already-loaded ``sys`` module.
# NOTE(review): this looks like a leftover of the Python 2
# ``reload(sys)`` / ``sys.setdefaultencoding`` idiom; nothing visible in this
# file depends on it -- confirm before removing.
importlib.reload(sys)
# Start timestamp for the elapsed-time report printed by the ``__main__``
# guard. It is taken before the remaining imports below, so their load time
# is included in the reported duration.
time1 = time.time()
import os.path
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
# Path of the input PDF that parse() opens for reading (a relative path, so
# it is resolved against the current working directory).
text_path = r'D009-2009-090-0710_OCR.pdf'
def parse(pdf_path=None, output_path=r'2.txt'):
    """Extract the text of a PDF and append it to a text file.

    Each ``LTTextBoxHorizontal`` produced by pdfminer's layout analysis is
    printed to stdout and appended to ``output_path``, followed by a newline.

    Args:
        pdf_path: Path of the PDF to read. When omitted, the module-level
            ``text_path`` is used.
        output_path: Text file that receives the extracted text. It is opened
            in append mode, so existing content is kept; the file is created
            even if the document yields no text boxes.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    if pdf_path is None:
        # Resolved at call time so a rebinding of ``text_path`` is honoured.
        pdf_path = text_path

    # Context managers guarantee both files are closed, even when parsing
    # fails part-way through the document.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Connect the parser and the document to each other, then initialize.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # Guard clause: stop before touching the output file.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box. UTF-8 is set
        # explicitly so text outside the locale's default encoding can still
        # be written.
        with open(output_path, 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    # Only horizontal text boxes carry the text we want.
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + "\n")
if __name__ == '__main__':
    # Run the extraction, then report the wall-clock time elapsed since the
    # module-level ``time1`` timestamp was taken.
    parse()
    time2 = time.time()
    elapsed = time2 - time1
    print("总共消耗时间为:", elapsed)