安装pdfminer
python3安装pdfminer3k, python2安装pdfminer2k
具体实现
def extract_layout_by_page(pdf_path):
    """Parse a PDF file and collect the analysed layout of each page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one entry per page, in the order the pages are yielded
        by ``document.get_pages()``. Each entry is the value returned by
        ``PDFPageAggregator.get_result()`` after that page was processed.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    laparams = LAParams()
    # Binary mode: the parser is fed the raw file object. The ``with`` block
    # releases the handle on every exit path (including the raise below)
    # instead of leaking it.
    with open(pdf_path, 'rb') as fp:
        # Build the parser on top of the open file.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Link the parser and the document to each other, then initialise.
        parser.set_document(document)
        document.set_parser(parser)
        document.initialize()
        # Refuse documents that do not permit text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # The resource manager is shared by the device and the interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        layouts = []
        # Process every page while the file is still open; the aggregator
        # exposes the layout of the page that was interpreted last.
        # NOTE(review): assumes the collected layout objects do not read
        # from the file after this function returns - confirm.
        for page in document.get_pages():
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts
def pdf_to_txt(pdf_path, newname1):
    """Extract the text of a PDF into a text file and re-join wrapped lines.

    The text of every layout element that exposes ``get_text`` is first
    written to ``newname1``. The file is then read back line by line: a line
    whose newline is directly preceded by whitespace keeps its line break,
    while any other line has its newline removed so that it joins the line
    that follows. The result is written over ``newname1``.

    Args:
        pdf_path: Path of the PDF file to convert.
        newname1: Path of the text file to write; it is opened in ``'w'``
            mode, so existing content is replaced.

    Returns:
        ``newname1``, unchanged.
    """
    page_layouts = extract_layout_by_page(pdf_path)
    with open(newname1, 'w', encoding='utf-8') as f:
        for current_page in page_layouts:
            for element in current_page:
                # Only elements that carry text are of interest.
                if not hasattr(element, "get_text"):
                    continue
                text = element.get_text()
                # Skip elements that contain nothing but a blank line.
                if text == ' \n':
                    continue
                f.write(text)

    # Restore the original line breaks: only lines ending in
    # "<whitespace>\n" are treated as real line ends; every other newline is
    # dropped so the wrapped text flows on.
    pattern = re.compile(r'\s{1,2}\n')
    with open(newname1, 'r', encoding='UTF-8-sig') as f:
        newcontent = [
            line if pattern.search(line) else line.replace('\n', '')
            for line in f
        ]

    with open(newname1, 'w', encoding='UTF-8') as f:
        f.writelines(newcontent)
    return newname1