PDF解析

最新推荐文章于 2024-02-28 19:54:04 发布
weixin_42405511
最新推荐文章于 2024-02-28 19:54:04 发布
阅读量110
点赞数
文章标签： python
本文链接：https://blog.csdn.net/weixin_42405511/article/details/118225980
版权
import pdfplumber
import re
import os
import pandas as pd
import openpyxl as xl
from docx import Document


class parseFile():
    """Extract question/answer pairs from tables found in PDF and Word files.

    The table cells of a document are flattened into one list. The cells
    located between the row mentioning both '投资者关系活动' and '内容介绍' and
    the row mentioning '附件清单' are treated as the Q&A text, which is then
    split into questions and answers with a list of regular expressions.

    Results accumulate in ``resultList`` (one row per question) and the names
    of files that could not be processed accumulate in ``errorList``.
    """

    def __init__(self):
        # Patterns tried in order on PDF text; group 1 captures the question.
        self.chapter_regex = [
            r'\s*\d+\.\s+(.*?)？\s+',
            r'\s+\d+：(.*?)？\s+',
            r'\s+\d+、(.*?)答：',
            r'\s*[一二三四五六七八九十]+、(.*?)\s+',
            r'\s*问：\s*(.*?)答：',
            r'\s*问题\d+：\s*(.*?)回复：',
            r'\s*\d+、\s*(.*?)\s+'
        ]
        # Patterns used instead of chapter_regex when fileFlag is truthy.
        self.doc_regex = [
            r'\s+(.*?)？'
        ]
        self.fileFlag = 0      # 0: use chapter_regex, 1: use doc_regex
        self.CODE = ''         # file-name prefix before the first '-'
        self.anwTime = ''      # date parsed from the table, 'Y-M-D'
        self.file_name = ''    # name of the file currently being parsed
        self.resultList = []   # extracted rows, written to Excel by run()
        self.errorList = []    # names of files that failed to parse

    def getRegexRes(self, preList):
        """Print the matches of every chapter regex for each string.

        Debugging helper. For each string in ``preList`` all patterns of
        ``chapter_regex`` are tried and non-empty match lists are printed.

        :param preList: iterable of strings to match against
        :return: the (empty) match list of the last pattern as soon as a
            string yields no match for it; otherwise ``None``
        """
        # NOTE(review): the result of the recursive call is discarded and only
        # the last pattern's matches decide whether to recurse. This mirrors
        # the original behaviour; confirm the intended contract before relying
        # on the return value.
        preLi = []
        for preres in preList:
            for regex_str in self.chapter_regex:
                preLi = re.findall(regex_str, preres, re.DOTALL)
                if not preLi:
                    continue
                print('----------------正则匹配结果：')
                print(preLi)
                for number, item in enumerate(preLi, 1):
                    print(number, item)
            if not preLi:
                return preLi
            self.getRegexRes(preLi)

    def getIndex(self, preLi, preStr):
        """Return the index of the first element of ``preLi`` containing ``preStr``.

        :return: the index, or ``None`` when no element contains ``preStr``
        """
        for index, row in enumerate(preLi):
            if preStr in row:
                return index
        return None

    def getQueAndAnw(self, textBlock):
        """Split a block of table cells into questions and answers.

        The cells are joined with tabs, the first pattern list entry that
        matches supplies the questions, and each answer is the text between a
        question and the next one (or the end of the text for the last one).
        On success one row per question is appended to ``resultList``;
        otherwise ``file_name`` is appended to ``errorList``.

        :param textBlock: list of cell texts; ``None`` cells are treated as ''
        """
        print(textBlock)
        queLi = []
        anwLi = []
        # Empty table cells may arrive as None; join() would raise on them.
        textStr = '\t'.join(cell or '' for cell in textBlock)
        print(textStr)
        regexList = self.doc_regex if self.fileFlag else self.chapter_regex

        for regex_str in regexList:
            preLi = re.findall(regex_str, textStr, re.DOTALL)
            if not preLi:
                continue
            for row in preLi:
                if '提问及回复' in row:
                    row = row.split('提问及回复')[1]
                # A double tab marks an empty cell inside the match. Keep only
                # the part before it unless the part after it holds the '？'.
                # (Checking for '\t\t' rather than '\t' avoids an IndexError on
                # rows that contain a single tab only.)
                if '\t\t' in row:
                    leftStr, rightStr = row.split('\t\t')[:2]
                    if '？' not in rightStr:
                        row = leftStr
                queLi.append(row)
            # Only the first pattern that matches anything is used.
            break

        # Print results and collect the answer belonging to each question.
        for pos, question in enumerate(queLi):
            print('-----第{}个匹配结果-----'.format(pos + 1))
            print(question)

            if pos + 1 == len(queLi):
                # Last question: the answer is everything after it. Splitting
                # once keeps the whole remainder even if the text recurs.
                anwserRes = textStr.split(question, 1)[1]
                print('-----答案提取结果-----\n{}'.format(anwserRes))
                anwLi.append(anwserRes)
                continue

            anw_pattern = r'{}(.*?){}'.format(
                re.escape(question), re.escape(queLi[pos + 1]))
            found = re.search(anw_pattern, textStr, re.DOTALL)
            if found:
                anwserStr = found.group(1)
                print('-----答案提取结果-----\n{}'.format(anwserStr))
                anwLi.append(anwserStr)
            else:
                print('第{}个未匹配上'.format(pos + 1))

        if queLi and len(queLi) == len(anwLi):
            print('提取成功！')
            for que, anw in zip(queLi, anwLi):
                self.resultList.append([
                    self.CODE,
                    self.anwTime,
                    self.anwTime,
                    que.strip().replace('\n', ''),
                    anw.strip().replace('\n', '').replace('？', '').replace('{]', ''),
                    self.file_name,
                ])
        else:
            print('该附件提取存在问题：{}'.format(self.file_name))
            self.errorList.append(self.file_name)

    def wtiteToTxt(self, path, resultList):
        """Append every string of ``resultList`` to ``path``, one per line.

        The file is written as UTF-8 so that Chinese file names do not depend
        on the platform's default encoding.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(row + '\n' for row in resultList)

    def _parseDate(self, timeStr):
        """Return 'Y-M-D' parsed from a '…年…月…日' string, or '' if absent."""
        found = re.search(r'(\d+)年\s*(\d+)\s*月\s*(.*?)日', timeStr)
        if found is None:
            print('正则匹配日期异常')
            return ''
        # A part such as '29-30' is reduced to the text before the first '-'.
        return '-'.join(part.split('-')[0] for part in found.groups())

    def parseTable(self, tableList):
        """Locate the Q&A section and the date in a flat list of table cells.

        Sets ``anwTime`` from the cell following the first cell containing
        '时间', then passes the cells between the content header and the
        '附件清单' cell to :meth:`getQueAndAnw`. When either marker is missing,
        or extraction raises, ``file_name`` is appended to ``errorList``.

        :param tableList: list of cell texts (``None`` or '' for empty cells)
        """
        # Do not carry the date of a previously parsed file over to this one.
        self.anwTime = ''
        startIndex = None
        endIndex = None
        timeStr = ''
        try:
            for index, row in enumerate(tableList):
                if not row:
                    continue
                compact = row.replace('\n', '').replace('\r', '').replace('\t', '')
                # Both keywords must be present; the previous
                # "'a' and 'b' in row" only ever tested the second one.
                if '投资者关系活动' in compact and '内容介绍' in compact:
                    startIndex = index
                if '附件清单' in compact:
                    endIndex = index
                    break
                if (not timeStr
                        and '时间' in compact.replace(' ', '')
                        and index + 1 < len(tableList)):
                    timeStr = tableList[index + 1] or ''

            if timeStr:
                self.anwTime = self._parseDate(timeStr)

            if startIndex is None or endIndex is None:
                print('该附件不满足条件，请检查')
                self.errorList.append(self.file_name)
                return

            self.getQueAndAnw(tableList[startIndex + 1:endIndex])

        except Exception as e:
            # Best effort: one malformed document must not stop a batch run.
            print(e)
            print('该附件不满足条件，请检查')
            self.errorList.append(self.file_name)

    def parsePdf(self, path):
        """Parse one PDF file.

        Flattens the cells of every table on every page into a single list and
        hands it to :meth:`parseTable`. A file that cannot be read is recorded
        in ``errorList``.

        :param path: path of the PDF file
        """
        tableList = []
        try:
            with pdfplumber.open(path) as f:
                for page in f.pages:
                    for table in page.extract_tables():
                        for line in table:
                            tableList.extend(line)
        except Exception as e:
            print(e)
            print('--------------该附件无法打开')
            self.errorList.append('无法打开' + self.file_name)
            return

        print(tableList)
        self.parseTable(tableList)

    def parseWord(self, path):
        """Parse one Word file.

        Flattens the cell texts of every table in the document into a single
        list and hands it to :meth:`parseTable`. A file that cannot be read is
        recorded in ``errorList`` in the same way as in :meth:`parsePdf`.

        :param path: path of the Word file
        """
        try:
            doc = Document(path)
            tableList = [
                cell.text
                for tb in doc.tables
                for row in tb.rows
                for cell in row.cells
            ]
        except Exception as e:
            print(e)
            print('--------------该附件无法打开')
            self.errorList.append('无法打开' + self.file_name)
            return

        self.parseTable(tableList)

    def writeToExcel(self, lastResultList):
        """Write a header row plus ``lastResultList`` to ./researchExcel.xlsx."""
        resultPath = './researchExcel.xlsx'
        workbook = xl.Workbook()
        sheet = workbook.active
        # Header row.
        headers = ['CODE', u'提问时间', u'回答时间', u'问题内容', u'回答', u'解析文件名']
        sheet.append(headers)
        for res in lastResultList:
            sheet.append(res)
        workbook.save(resultPath)

    def run(self, base_path):
        """Parse every PDF/Word file below ``base_path`` and write the results.

        Results go to ./researchExcel.xlsx and the names of failed files are
        appended to ./errorList.txt.

        :param base_path: directory that is walked recursively
        """
        for root, dirs, files in os.walk(base_path):
            for file in files:
                # Join with the directory being walked, not with base_path,
                # so files in sub-directories get a valid path.
                path = os.path.join(root, file)
                self.CODE = file.split('-')[0]
                self.file_name = file
                print('-----------------当前解析文件-----------------', path)
                lowered = path.lower()
                if lowered.endswith('pdf'):
                    # Reset the flag so a PDF following a Word file uses the
                    # PDF patterns again.
                    self.fileFlag = 0
                    self.parsePdf(path)
                elif lowered.endswith(('doc', 'docx')):
                    self.fileFlag = 1
                    self.parseWord(path)
        self.writeToExcel(self.resultList)
        self.wtiteToTxt(r'./errorList.txt', self.errorList)


if __name__ == '__main__':
    # Folder whose PDF / Word files are parsed in one batch. Earlier
    # experiments pointed at D:\jhhuang.Irving\YUN\Study\File\PDF or at single
    # files inside it instead.
    #
    # Known issue: for
    # 000050-深天马Ａ-2018年6月28日投资者关系活动记录表.PDF
    # the first regex match comes out overly long.
    target_dir = r'D:\jhhuang.Irving\YUN\Study\File\TEST'
    parser = parseFile()
    parser.run(target_dir)
weixin_42405511
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
PDF解析

import pdfplumber import pandas as pd import re def getQueAndAnw(textBlock): print(textBlock) resultList = [] queLi = [] anwLi = [] textStr = ''.join(textBlock) print(textStr) # preList = re.split(r'？\s+\n|\n\s+\n', textStr).
复制链接

扫一扫