# Read PDF files with Python
# coding=utf-8
import os
import xlsxwriter
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import requests
#pip install pdfminer3k
def _value_after(items, label):
    """Return the element following the first ``label`` in ``items``, or None.

    None is returned both when ``label`` is absent and when it is the last
    element (so there is no following value to read).
    """
    try:
        return items[items.index(label) + 1]
    except (ValueError, IndexError):
        return None


def each_pdf(path):
    """Extract the former name and suspected actual controller from a PDF.

    Every text element of every page is collected (with newlines removed)
    into a flat list. The values are taken to be the elements that directly
    follow the labels '曾用名:' and '疑似实际控制人' in that list.

    Sample PDFs ship with the pdfminer3k source download
    (https://pypi.org/project/pdfminer3k/1.0.1/#files) under
    pdfminer3k-master/samples/nonfree.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The string '<former name>|<controller>' when both labels are present
        and each is followed by another text element; otherwise the sentinel
        string '无'.

    Raises:
        OSError: If the file cannot be opened.
    """
    print(path)
    content = []
    # pdfminer pulls data from the file object while pages are processed, so
    # all parsing has to finish before the file is closed.
    with open(path, "rb") as fp:
        # Parser bound to the open file.
        parser = PDFParser(fp)
        # Document object; parser and document must be linked both ways.
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise the document; pdfminer3k takes the password here and an
        # empty string is passed.
        doc.initialize("")
        # Resource manager, layout parameters, aggregating device and page
        # interpreter.
        resource = PDFResourceManager()
        laparam = LAParams()
        device = PDFPageAggregator(resource, laparams=laparam)
        interpreter = PDFPageInterpreter(resource, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            layout = device.get_result()
            # Only layout objects that expose get_text() carry text.
            content.extend(
                out.get_text().replace('\n', '')
                for out in layout
                if hasattr(out, "get_text")
            )

    # Both labels must be present (the previous `'a' and 'b' in content` test
    # only checked the second one) and each must have a following element.
    former_name = _value_after(content, '曾用名:')
    controller = _value_after(content, '疑似实际控制人')
    if former_name is None or controller is None:
        result = '无'
    else:
        result = former_name + '|' + controller
    print(result)
    return result
# Names of the PDF files to process, one per line in data/nb.txt.
with open('data/nb.txt', 'r', encoding='utf-8') as file:
    filenameAll = [line.strip() for line in file]

# Built once so the per-file membership test below is O(1).
wanted_names = set(filenameAll)

# Walk the 'pdf' directory tree and keep the result of every listed file for
# which each_pdf() found both fields ('无' is its "nothing found" sentinel).
allContent = []
for dirpath, dirnames, filenames in os.walk('pdf'):
    for filename in filenames:
        if filename not in wanted_names:
            continue
        content = each_pdf(os.path.join(dirpath, filename))
        if content != '无':
            allContent.append(content)

# Write one row per PDF and one column per '|'-separated field into a sheet
# named 'sheet1'. The with-block closes (and thereby saves) the workbook even
# if a write fails.
with xlsxwriter.Workbook('疑似实际控制人2.xlsx') as workbook:
    worksheet = workbook.add_worksheet('sheet1')
    for row, record in enumerate(allContent):
        for col, value in enumerate(record.split('|')):
            worksheet.write(row, col, value)