python读取PDF

最新推荐文章于 2024-05-12 19:17:44 发布

啦啦啦啦～。～

最新推荐文章于 2024-05-12 19:17:44 发布

阅读量435

点赞数

文章标签： python

本文链接：https://blog.csdn.net/weixin_44718730/article/details/107855945

版权

python读取PDF文件

# coding=utf-8
import os

import xlsxwriter
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import requests

#pip install pdfminer3k
def _text_after(items, label):
    """Return the element that follows *label* in *items*, or None.

    None is returned when *label* is absent or is the last element, so the
    caller never has to deal with ValueError / IndexError.
    """
    try:
        position = items.index(label)
    except ValueError:
        return None
    following = position + 1
    return items[following] if following < len(items) else None


def each_pdf(path):
    """Extract the "former name" and "suspected actual controller" from a PDF.

    Every layout object that exposes ``get_text`` is collected (with newlines
    removed) into a flat list. The values are taken to be the list elements
    immediately following the labels '曾用名：' and '疑似实际控制人'.

    Args:
        path: Path of the PDF file to read.

    Returns:
        ``"<former name>|<controller>"`` when both labels are present and each
        is followed by another text element; otherwise the sentinel '无'.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # Sample PDFs ship with pdfminer3k
    # (https://pypi.org/project/pdfminer3k/1.0.1/#files, under
    # pdfminer3k-master\samples\nonfree).
    # The file must stay open while pages are being read; ``with`` guarantees
    # it is closed afterwards, even if parsing fails.
    with open(path, "rb") as fp:
        print(path)

        # Parser bound to the open file.
        parser = PDFParser(fp)

        # The PDF document object.
        doc = PDFDocument()

        # Link parser and document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document with an empty password.
        doc.initialize("")

        # Resource manager, layout parameters, aggregating device and
        # page interpreter.
        resource = PDFResourceManager()
        laparam = LAParams()
        device = PDFPageAggregator(resource, laparams=laparam)
        interpreter = PDFPageInterpreter(resource, device)

        content = []
        for page in doc.get_pages():
            # Interpret the page, then pull the aggregated layout from the
            # device.
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only text-bearing layout objects expose get_text().
                if hasattr(out, "get_text"):
                    content.append(out.get_text().replace('\n', ''))

    result = '无'
    # Both labels must be present AND be followed by a value. The previous
    # `'a' and 'b' in content` only tested the second label.
    former_name = _text_after(content, '曾用名：')
    controller = _text_after(content, '疑似实际控制人')
    if former_name is not None and controller is not None:
        result = former_name + '|' + controller
    print(result)
    return result

# Load the names of the PDF files to process: one name per line of
# data/nb.txt, with surrounding whitespace stripped. The `with` block closes
# the file, so no explicit close() is needed.
with open('data/nb.txt', 'r', encoding='utf-8') as file:
    filenameAll = [line.strip() for line in file]

# Walk the 'pdf' directory tree and run each_pdf() on every file whose name is
# listed in filenameAll, keeping only results other than the '无' sentinel.
allContent = []
wanted_names = set(filenameAll)  # O(1) membership test per file
for dirpath, dirnames, filenames in os.walk('pdf'):
    for filename in filenames:
        if filename not in wanted_names:
            continue
        content = each_pdf(os.path.join(dirpath, filename))
        if content != '无':
            allContent.append(content)

# Write the collected results to a spreadsheet: one row per entry of
# allContent, one column per '|'-separated field. The context manager closes
# (and thereby saves) the workbook even if a write fails.
with xlsxwriter.Workbook('疑似实际控制人2.xlsx') as workbook:
    # Create a worksheet named 'sheet1'.
    worksheet = workbook.add_worksheet('sheet1')

    for row, entry in enumerate(allContent):
        # Split once per row instead of once per cell.
        for col, field in enumerate(entry.split('|')):
            worksheet.write(row, col, field)

啦啦啦啦～。～

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python读取PDF

python读取PDF文件 # coding=utf-8 import os; import xlsxwriter; from pdfminer.pdfparser import PDFParser, PDFDocument; from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter; from pdfminer.pdfdevice import PDFDevice; from pdfminer.layout import LAParams
复制链接

扫一扫