对于AcroForm类型的PDF,如果想要提取其中的表单内容,可以使用Python的pdfminer库来实现。如下图所示,是一份AcroForm类型的PDF确认单。
代码示例
# 安装pdfminer:pip install pdfminer.six(原pdfminer包已停止维护,pdfminer.six为社区维护的分支,导入名仍为pdfminer)
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from pdfminer.psparser import PSLiteral, PSKeyword
from pdfminer.utils import decode_text
# Hard-coded location of the AcroForm PDF to read; the raw string keeps the
# Windows backslashes literal.
pdf_path = r"E:\项目目录\嘉实\一年持有期混合型基金.pdf"
# Accumulator filled by the extraction loop: form field name -> decoded value(s).
data = dict()
def decode_value(value):
    """Turn a raw pdfminer field value into plain Python text where possible.

    A ``PSLiteral`` or ``PSKeyword`` is replaced by its ``name`` attribute;
    if the result (or the original argument) is ``bytes`` it is passed through
    ``decode_text``. Any other value is returned unchanged.
    """
    # Unwrap PostScript literal/keyword objects to the name they carry.
    unwrapped = value.name if isinstance(value, (PSLiteral, PSKeyword)) else value
    # Only byte strings need decoding; everything else passes straight through.
    return decode_text(unwrapped) if isinstance(unwrapped, bytes) else unwrapped
# Read every top-level AcroForm field of the PDF at `pdf_path` into `data`
# as {field name: decoded value(s)}, then print the collected mapping.
# Raises ValueError when the document catalog has no /AcroForm entry.
with open(pdf_path, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    catalog = resolve1(doc.catalog)

    # Only documents whose catalog carries /AcroForm hold interactive form fields.
    if 'AcroForm' not in catalog:
        raise ValueError("No AcroForm Found")

    acro_form = resolve1(catalog['AcroForm'])
    # /Fields can itself be stored as an indirect reference, so resolve it too;
    # a reference that resolves to nothing is treated as "no fields".
    fields = resolve1(acro_form['Fields']) or []

    for field_ref in fields:
        field = resolve1(field_ref)
        # Raw field dictionary, printed as in the original script (debugging aid).
        print(field)
        if not isinstance(field, dict):
            # A dangling reference gives nothing to read a name or value from.
            continue

        # /TU (alternate, user-facing name) is optional in a field dictionary;
        # fall back to /T (partial field name) instead of crashing in
        # decode_text(None) when it is absent.
        name = field.get('TU')
        if name is None:
            name = field.get('T')
        name = resolve1(name)
        if name is None:
            # Nameless field: there is no key to store its value under.
            continue
        if isinstance(name, bytes):
            name = decode_text(name)

        # /V may be an indirect object and may hold one value or a list of them.
        values = resolve1(field.get('V'))
        if isinstance(values, list):
            values = [decode_value(v) for v in values]
        else:
            values = decode_value(values)

        data[name] = values

print(data)
# Presumably a sample of what the print(data) call above emits for one
# confirmation PDF; several values appear masked with "x". This bare string
# literal is never assigned or used, so it serves purely as documentation.
"""
{
'基金账号': '07Ixxxxx28', '交易账号': '01xxxxx96', '基金代码': '003458', '基金名称': '嘉实稳宏债券A',
'原申请单编号': '2002xxxxxx42', '单位净值': '1.5332', '手续费': '15430.76', '申请份额': '10064416.56',
'成功份额': '10064416.56', '成功金额': '15415332.70', '未付收益': '', '处理结果': '确认成功', '失败原因': '',
'操作员': '嘉实直销柜台', '打印日期': '20230217', '邮编': '200120', '电话': '02xxxx93',
'经办人': '028xxxxxx', '地址': '上海市xxxxx号', '申请日期': '20230216', '确认日期': '20230217',
'确认编号': '201xxxxxx3', '投资者姓名/名称': 'xxxxx一期混合型基金(FOF)'
}
"""