Reading Listed-Company Annual Report Text (Including Tables) with pdfplumber

Getting the Data

The raw text data can be downloaded from the URL below. The full dataset is fairly large; if you only want to study or test, just pick a few files to download:

https://modelscope.cn/datasets/modelscope/chatglm_llm_fintech_raw_dataset/summary
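
If you prefer to script the download, here is a minimal sketch using requests; file_url is a placeholder, replace it with a direct file link copied from the dataset page:

import requests

# hypothetical URL: copy the real file link from the dataset page
file_url = 'https://example.com/年度报告.pdf'
resp = requests.get(file_url, timeout=60)
resp.raise_for_status()
with open('./年度报告.pdf', 'wb') as f:
    f.write(resp.content)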

Defining the PDF Reader

Building on the open-source tool at https://github.com/dictmap/pdf_to_txt, we define the following PDF reader class:

import pdfplumber
import re
from collections import defaultdict
import json

class PDFProcessor:
    """
    PDF reader for listed-company annual reports (extensible to other PDF documents)
    """
    def __init__(self, filepath):
        self.filepath = filepath
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)  # stores every extracted text record
        self.allrow = 0  # global row index across the whole document
        self.last_num = 0  # index of the last row of the previous page (used for header/footer detection)

    def check_lines(self, page, top, buttom):
        """
        Inspect the extracted word fragments and merge them into lines of text.
        top and buttom keep text outside tables and text inside tables in the correct order.
        """
        lines = page.extract_words()[::]
        text = ''
        last_top = 0
        last_check = 0
        for l in range(len(lines)):
            each_line = lines[l]
            check_re = '(?:。|;|单位:元|单位:万元|币种:人民币|\d|报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'  # endings that typically mark a complete line in annual reports
            if top == '' and buttom == '':
                if abs(last_top - each_line['top']) <= 3:  # is this word on the same visual line as the previous one?
                    text = text + each_line['text']
                elif last_check > 0 and (page.height * 0.9 - each_line['top']) > 0 and not re.search(check_re, text):  # the previous line ran to the page margin and does not end like a complete line, so keep appending
                    text = text + each_line['text']
                else:
                    if text == '':
                        text = each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            elif top == '':
                if each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 3:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        if text == '':
                            text = each_line['text']
                        else:
                            text = text + '\n' + each_line['text']
            else:
                if each_line['top'] < top and each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 3:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        if text == '':
                            text = each_line['text']
                        else:
                            text = text + '\n' + each_line['text']
            last_top = each_line['top']
            last_check = each_line['x1'] - page.width * 0.85

        return text

    def drop_empty_cols(self, data):
        # drop every column whose cells are all empty, e.g. [['a', '', 'b'], ['c', '', 'd']] -> [['a', 'b'], ['c', 'd']]
        transposed_data = list(map(list, zip(*data)))
        filtered_data = [col for col in transposed_data if not all(cell == '' for cell in col)]
        result = list(map(list, zip(*filtered_data)))
        return result

    def extract_text_and_tables(self, page):
        buttom = 0
        tables = page.find_tables()
        # tables = page.find_tables(table_settings={'intersection_x_tolerance': 1})

        if len(tables) >= 1:
            count = len(tables)
            for table in tables:
                if table.bbox[3] < buttom:
                    pass
                else:
                    count -= 1
                    top = table.bbox[1]
                    text = self.check_lines(page, top, buttom)  # text between the previous table (or the top of the page) and this table
                    text_list = text.split('\n')
                    for _t in range(len(text_list)):
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'text', 'inside': text_list[_t]}
                        self.allrow += 1

                    buttom = table.bbox[3]

                    new_table = table.extract()
                    r_count = 0
                    # merge continuation rows (rows whose first cell is None) back into the preceding row
                    for r in range(len(new_table)):
                        row = new_table[r]
                        if row[0] is None:
                            r_count += 1
                            for c in range(len(row)):
                                if row[c] is not None and row[c] not in ['', ' ']:
                                    if new_table[r - r_count][c] is None:
                                        new_table[r - r_count][c] = row[c]
                                    else:
                                        new_table[r - r_count][c] += row[c]
                                    new_table[r][c] = None
                        else:
                            r_count = 0

                    end_table = []
                    for row in new_table:
                        if row[0] is not None:
                            cell_list = []
                            cell_check = False
                            for cell in row:
                                if cell is not None:
                                    cell = cell.replace('\n', '')
                                else:
                                    cell = ''
                                if cell != '':
                                    cell_check = True
                                cell_list.append(cell)
                            if cell_check:
                                end_table.append(cell_list)
                    end_table = self.drop_empty_cols(end_table)

                    for row in end_table:
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'excel', 'inside': str(row)}
                        self.allrow += 1

                    if count == 0:  # after the last table on the page, collect the remaining text below it
                        text = self.check_lines(page, '', buttom)
                        text_list = text.split('\n')
                        for _t in range(len(text_list)):
                            self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                          'type': 'text', 'inside': text_list[_t]}
                            self.allrow += 1

        else:
            text = self.check_lines(page, '', '')
            text_list = text.split('\n')
            for _t in range(len(text_list)):
                self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                              'type': 'text', 'inside': text_list[_t]}
                self.allrow += 1

        # Tag header and footer rows. These rules are tailored to listed-company annual reports; headers and footers differ between documents, so adjust as needed.
        first_re = '[^计](?:报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
        end_re = '^(?:\d|\\|\/|第|共|页|-|_| ){1,}'
        if self.last_num == 0:
            try:
                first_text = str(self.all_text[0]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[0]['type'] = '页眉'
                    if re.search(end_re, end_text) and not '[' in end_text:
                        self.all_text[len(self.all_text) - 1]['type'] = '页脚'
            except:
                print(page.page_number)
        else:
            try:
                first_text = str(self.all_text[self.last_num + 1]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[self.last_num + 1]['type'] = '页眉'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = '页脚'
            except:
                print(page.page_number)

        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        # iterate over every page and extract its text and tables
        for i in range(len(self.pdf.pages)):
            self.extract_text_and_tables(self.pdf.pages[i])

    def save_all_text(self, path):
        # save the extracted records, one JSON object per line (the file is opened once instead of once per record)
        with open(path, 'a+', encoding='utf-8') as file:
            for key in self.all_text.keys():
                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')

Reading the Text and Saving It

def process_file(file_path):
    """
    Read the text of a PDF file and save it to disk.
    """
    print('start ', file_path)
    processor = PDFProcessor(file_path)
    processor.process_pdf()  # extract the text
    save_path = './' + file_path.split('/')[-1].replace('.pdf', '.txt')
    processor.save_all_text(save_path)
    print('finish ', save_path)

process_file('./年度报告.pdf')  # adjust the input path and the output path as needed
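
If you have downloaded several reports, the same function can be applied to each of them. A minimal sketch, assuming the PDFs sit in a ./pdfs folder (a hypothetical path, adjust to your own layout):

import glob

# process every PDF in the folder with the reader defined above
for pdf_path in glob.glob('./pdfs/*.pdf'):
    process_file(pdf_path)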

After the extraction and saving steps above, you get a file with one JSON record per line.

In each record, page is the page number, allrow is the row index, type marks the kind of content (excel denotes table text), and inside is the text itself.
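
For illustration, a single record in the saved file looks like the following (the content shown here is made up):

{"page": 1, "allrow": 5, "type": "text", "inside": "某某股份有限公司2022年年度报告"}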

Merging the Text and Saving It in a Format LLMs Can Read

Converting Table Text to Markdown

Current large language models handle Markdown-formatted tables reasonably well, so we define the following conversion function:

# convert a table (a list of rows) into a Markdown-style string
def table_converter(table):
    table_string = ''
    # iterate over each row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # replace embedded line breaks in wrapped cells and turn None cells into the string 'None'
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # join the cells with '|' and end the row with a newline
        table_string += '|' + '|'.join(cleaned_row) + '|' + '\n'
    # drop the trailing newline
    table_string = table_string[:-1]
    return table_string
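
As a quick sanity check, here is an illustrative call with a small made-up table:

sample = [['项目', '2022年', '2021年'], ['营业收入', '1,000', '900']]
print(table_converter(sample))
# |项目|2022年|2021年|
# |营业收入|1,000|900|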

Merging the Text and Removing Headers, Footers, etc.

import jsonlines

# helper functions for reading and writing the JSON-lines records produced above
def read_jsonl(path):
    content = []
    with jsonlines.open(path, "r") as json_file:
        for obj in json_file.iter(type=dict, skip_invalid=True):
            content.append(obj)
    return content

def write_jsonl(path, content):
    with jsonlines.open(path, "w") as json_file:
        json_file.write_all(content)

# load the extracted records
texts = read_jsonl('./年度报告.txt')

contents = ''
table_data = []
for item in texts:
    if item['inside'] == '' or item['type'] == '页眉' or item['type'] == '页脚':
        pass  # skip empty rows, headers and footers
    elif item['type'] == 'excel':
        table_data.append(eval(item['inside']))  # collect consecutive table rows
    else:
        if table_data:
            # a table just ended: convert the collected rows to Markdown first
            table_str = table_converter(table_data)
            contents += table_str + '\n'
        contents += item['inside'] + '\n'
        table_data = []

# flush a trailing table in case the document ends with one
if table_data:
    contents += table_converter(table_data) + '\n'

print(contents)  # print the merged text
# save the result as a plain-text file that an LLM pipeline can read
with open('./test_contents.txt', 'w', encoding='utf-8') as f:
    f.write(contents)

In the final text, paragraphs and Markdown-style table rows appear in their original reading order.

In summary, a PDF reader can be tailored to the content of different PDFs; after conversion, the result is knowledge-base document content that is well suited for large language models to read.
