您要找的是每个布局对象上的 bbox 属性。PDFMiner 文档中有一些关于如何解析布局层次结构(how to parse the layout hierarchy)的信息,但并没有涵盖所有内容。
下面是一个例子:

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure
def parse_layout(layout) -> None:
    """Recursively walk a layout tree and print details of each object.

    For every object obtained by iterating ``layout``, the object's class
    name and its ``bbox`` attribute are printed. Text boxes and text lines
    additionally have their text printed via ``get_text()``. Figures are
    descended into recursively; every other object type is reported but
    not descended into.

    Args:
        layout: An iterable of layout objects, each exposing a ``bbox``
            attribute. An empty iterable prints nothing.
    """
    for lt_obj in layout:
        print(lt_obj.__class__.__name__)
        print(lt_obj.bbox)
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            print(lt_obj.get_text())
        elif isinstance(lt_obj, LTFigure):
            # Figures contain further layout objects, so walk into them.
            parse_layout(lt_obj)
fp = open('example.pdf', 'rb')
pars