利用pdfplumber读取上市公司年度报告文本(包括表格)
获取数据
推荐从以下网址下载相应的文本数据,数据比较大,如果只是学习测试,自行选择几份文件下载即可:
https://modelscope.cn/datasets/modelscope/chatglm_llm_fintech_raw_dataset/summary
定义pdf文件读取器
结合https://github.com/dictmap/pdf_to_txt中的开源工具,定义如下PDF读取器类:
import pdfplumber
import re
from collections import defaultdict
import json
class PDFProcessor:
    """
    PDF reader for listed-company annual reports (extensible to other PDFs).

    Walks every page, extracts plain-text lines and table rows in reading
    order, and stores each one in ``self.all_text`` under a global running
    row index. Table rows are stored as ``str(list_of_cells)``.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)  # all extracted rows, keyed by running row index
        self.allrow = 0    # global running row index
        self.last_num = 0  # index of the previous page's last row (used for header/footer tagging)

    def check_lines(self, page, top, buttom):
        """
        Extract and merge the text of *page* into newline-separated lines.

        ``top`` / ``buttom`` (sic) bound the vertical region to read so that
        text outside tables keeps its order relative to table text; pass ''
        for an unbounded side. Words on (nearly) the same baseline, or lines
        that visually continue a paragraph, are concatenated without '\n'.
        """
        lines = page.extract_words()[::]
        text = ''
        last_top = 0
        last_check = 0
        # Annual-report line endings (sentence stops / unit markers / report
        # titles): a line ending with one of these starts a new paragraph.
        # Hoisted out of the loop — it is constant. Raw string keeps the regex
        # characters exactly as written.
        check_re = r'(?:。|;|单位:元|单位:万元|币种:人民币|\d|报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
        for l in range(len(lines)):
            each_line = lines[l]
            if top == '' and buttom == '':
                # Whole page (no table bounds).
                if abs(last_top - each_line['top']) <= 3:  # same visual line?
                    text = text + each_line['text']
                elif last_check > 0 and (page.height * 0.9 - each_line['top']) > 0 and not re.search(check_re, text):
                    # Previous line ran to the right margin and did not end a
                    # paragraph -> this word continues it.
                    text = text + each_line['text']
                else:
                    if text == '':
                        text = each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            elif top == '':
                # Only text below ``buttom`` (i.e. below the last table).
                if each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 3:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        if text == '':
                            text = each_line['text']
                        else:
                            text = text + '\n' + each_line['text']
            else:
                # Only text strictly between ``buttom`` and ``top``.
                if each_line['top'] < top and each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 3:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        if text == '':
                            text = each_line['text']
                        else:
                            text = text + '\n' + each_line['text']
            last_top = each_line['top']
            # > 0 when the word reaches past 85% of the page width, i.e. the
            # line is "full" and probably wraps.
            last_check = each_line['x1'] - page.width * 0.85
        return text

    def drop_empty_cols(self, data):
        """Remove every column whose cells are all empty strings."""
        transposed_data = list(map(list, zip(*data)))
        # Fix: value equality (==), not identity (is) — ``cell is ''`` relies
        # on string interning and raises a SyntaxWarning on modern Python.
        filtered_data = [col for col in transposed_data if not all(cell == '' for cell in col)]
        result = list(map(list, zip(*filtered_data)))
        return result

    def extract_text_and_tables(self, page):
        """
        Process one page: interleave plain-text lines and table rows in
        top-to-bottom order into ``self.all_text``, then tag the page's first
        and last rows as header ('页眉') / footer ('页脚') when they match the
        annual-report patterns.
        """
        buttom = 0
        tables = page.find_tables()
        # tables = page.find_tables(table_settings={'intersection_x_tolerance': 1})
        if len(tables) >= 1:
            count = len(tables)
            for table in tables:
                if table.bbox[3] < buttom:
                    # Table lies above the region already consumed; skip it.
                    pass
                else:
                    count -= 1
                    top = table.bbox[1]
                    # Plain text between the previous table (or page top) and this table.
                    text = self.check_lines(page, top, buttom)
                    text_list = text.split('\n')
                    for _t in range(len(text_list)):
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'text', 'inside': text_list[_t]}
                        self.allrow += 1
                    buttom = table.bbox[3]
                    new_table = table.extract()
                    r_count = 0
                    # Merge continuation rows (first cell None) into the row
                    # they belong to, ``r_count`` rows above.
                    for r in range(len(new_table)):
                        row = new_table[r]
                        if row[0] is None:
                            r_count += 1
                            for c in range(len(row)):
                                if row[c] is not None and row[c] not in ['', ' ']:
                                    if new_table[r - r_count][c] is None:
                                        new_table[r - r_count][c] = row[c]
                                    else:
                                        new_table[r - r_count][c] += row[c]
                                    new_table[r][c] = None
                        else:
                            r_count = 0
                    # Keep only the merged (non-continuation, non-empty) rows.
                    end_table = []
                    for row in new_table:
                        if row[0] is not None:
                            cell_list = []
                            cell_check = False
                            for cell in row:
                                if cell is not None:
                                    cell = cell.replace('\n', '')
                                else:
                                    cell = ''
                                if cell != '':
                                    cell_check = True
                                cell_list.append(cell)
                            if cell_check:
                                end_table.append(cell_list)
                    end_table = self.drop_empty_cols(end_table)
                    for row in end_table:
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'excel', 'inside': str(row)}
                        self.allrow += 1
            if count == 0:
                # Plain text below the last table on the page.
                text = self.check_lines(page, '', buttom)
                text_list = text.split('\n')
                for _t in range(len(text_list)):
                    self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                  'type': 'text', 'inside': text_list[_t]}
                    self.allrow += 1
        else:
            # No tables: the whole page is plain text.
            text = self.check_lines(page, '', '')
            text_list = text.split('\n')
            for _t in range(len(text_list)):
                self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                              'type': 'text', 'inside': text_list[_t]}
                self.allrow += 1
        # Header/footer tagging, tailored to listed-company annual reports;
        # adapt these patterns for other document families.
        # Raw strings fix the original non-raw patterns, where Python's string
        # escaping mangled '\\' and '\/' before the regex engine saw them.
        first_re = r'[^计](?:报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
        end_re = r'^(?:\d|\\|\/|第|共|页|-|_| ){1,}'
        if self.last_num == 0:
            try:
                first_text = str(self.all_text[0]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                # Fix: the header check must reject table rows ("['...']") in
                # the FIRST line; the original tested end_text here.
                if re.search(first_re, first_text) and '[' not in first_text:
                    self.all_text[0]['type'] = '页眉'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = '页脚'
            except Exception:
                # Best-effort tagging: log the page and continue, but no
                # longer swallow SystemExit/KeyboardInterrupt (bare except).
                print(page.page_number)
        else:
            try:
                first_text = str(self.all_text[self.last_num + 1]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in first_text:
                    self.all_text[self.last_num + 1]['type'] = '页眉'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = '页脚'
            except Exception:
                print(page.page_number)
        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Extract text and tables from every page of the PDF."""
        for i in range(len(self.pdf.pages)):
            self.extract_text_and_tables(self.pdf.pages[i])

    def save_all_text(self, path):
        """Append every collected row to *path*, one JSON object per line."""
        # Fix: open the file once instead of re-opening it per row.
        with open(path, 'a+', encoding='utf-8') as file:
            for key in self.all_text.keys():
                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')
读取文本信息并保存
def process_file(file_path):
"""
读取pdf文本信息并保存
"""
print('start ', file_path)
processor = PDFProcessor(file_path)
processor.process_pdf() # 读取文本信息
save_path = './' + file_path.split('/')[-1].replace('.pdf', '.txt')
processor.save_all_text(save_path)
print('finish ', save_path)
process_file('./年度报告.pdf')  # adjust the input PDF path (and the save path inside process_file) as needed
经过上述的读取和保存可以得到以下结果内容:
其中page为第几页,allrow是行数,type标识文本信息的类型(excel为表格文本类型),inside是文本内容。
合并文本并保存为大模型支持的文本格式
定义表格文本转为markdown格式
现阶段大模型对于markdown格式的表格文本数据的支持是比较好的,定义如下函数进行转换:
# 将表格转换为markdown格式字符串
def table_converter(table):
table_string = ''
# 遍历表格的每一行
for row_num in range(len(table)):
row = table[row_num]
# 从warp的文字删除线路断路器
cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
# 将表格转换为字符串,注意'|'、'\n'
table_string+=('|'+'|'.join(cleaned_row)+'|'+'\n')
# 删除最后一个换行符
table_string = table_string[:-1]
return table_string
合并文本,删除页眉、页脚等
import jsonlines
# 专门用于读取和保存上述字典字符串的文本信息数据
def read_jsonl(path):
    """Read a JSON-lines file into a list of dicts, skipping invalid lines."""
    with jsonlines.open(path, "r") as reader:
        return [record for record in reader.iter(type=dict, skip_invalid=True)]
def write_jsonl(path, content):
    """Write an iterable of records to *path* in JSON-lines format."""
    with jsonlines.open(path, "w") as writer:
        writer.write_all(content)
# Read the extracted rows back in and merge them into one plain-text document.
texts = read_jsonl('./年度报告.txt')
contents = ''
table_data = []
for item in texts:
    if item['inside'] == '' or item['type'] == '页眉' or item['type'] == '页脚':
        # Skip blank lines, page headers and page footers.
        pass
    elif item['type'] == 'excel':
        # NOTE(review): eval() on file contents is unsafe if the .txt file is
        # not fully trusted; ast.literal_eval would be the safe equivalent.
        table_data.append(eval(item['inside']))
    else:
        if table_data:
            # A plain-text line ends the current table: render it first.
            table_str = table_converter(table_data)
            contents += table_str + '\n'
            table_data = []
        contents += item['inside'] + '\n'
# Fix: flush a table that runs to the very end of the file — the original
# loop silently dropped it.
if table_data:
    contents += table_converter(table_data) + '\n'
print(contents)  # show the merged text
# Save as a plain .txt file an LLM pipeline can ingest.
with open('./test_contents.txt', 'w', encoding='utf-8') as f:
    f.write(contents)
最终得到的部分文本如下所示:
综上所述,根据不同的pdf内容,可以定制合适的pdf文件读取器,经过转换最终得到适合大模型读取的知识库文档文件内容。