PDF解析

import pdfplumber
import re
import os
import pandas as pd
import openpyxl as xl
from docx import Document


class parseFile():
    def __init__(self):
        self.chapter_regex = [
            r'\s*\d+\.\s+(.*?)?\s+',
            r'\s+\d+:(.*?)?\s+',
            r'\s+\d+、(.*?)答:',
            r'\s*[一二三四五六七八九十]+、(.*?)\s+',
            r'\s*问:\s*(.*?)答:',
            r'\s*问题\d+:\s*(.*?)回复:',
            r'\s*\d+、\s*(.*?)\s+'
        ]
        self.doc_regex = [
            r'\s+(.*?)?'
        ]
        self.fileFlag = 0
        self.CODE = ''
        self.anwTime = ''
        self.file_name = ''
        self.resultList = []
        self.errorList = []

    def getRegexRes(self, preList):
        preLi = []
        for preres in preList:
            for regex_str in self.chapter_regex:
                pattern = re.compile(regex_str, re.DOTALL)
                preLi = re.findall(pattern, preres)
                if not preLi:
                    continue
                print('----------------正则匹配结果:')
                print(preLi)
                for id, i in enumerate(preLi):
                    print(id + 1, i)
            if not preLi:
                return preLi
            else:
                self.getRegexRes(preLi)

    def getIndex(self, preLi, preStr):
        for index, row in enumerate(preLi):
            if preStr in row:
                return index

    def getQueAndAnw(self, textBlock):
        print(textBlock)
        regexList = []
        queLi = []
        anwLi = []
        # textStr = '{]'.join(textBlock)
        textStr = '\t'.join(textBlock)
        print(textStr)
        if self.fileFlag:
            regexList = self.doc_regex
        else:
            regexList = self.chapter_regex
        for regex_str in regexList:
            pattern = re.compile(regex_str, re.DOTALL)
            preLi = re.findall(pattern, textStr)
            if not preLi:
                continue
            for id, row in enumerate(preLi):
                row = row.split('提问及回复')[1] if '提问及回复' in row else row
                if '\t' in row:
                    leftStr = row.split('\t\t')[0]
                    rightStr = row.split('\t\t')[1]
                    if '?' not in rightStr:
                        row = leftStr
                queLi.append(row)
            break

        # 打印结果
        for id, row in enumerate(queLi):
            print('-----第{}个匹配结果-----'.format(id + 1))
            print(row)

            if id + 1 == len(queLi):
                anwserRes = textStr.split(queLi[id])[1]
                print('-----答案提取结果-----\n{}'.format(anwserRes))
                anwLi.append(anwserRes)
            else:
                # startId = self.getIndex(textBlock, queLi[id])
                # endId = self.getIndex(textBlock, queLi[id+1])
                # if startId + 2 == endId:
                #     anwserRes = textBlock[startId+1]
                anw_pattern = re.compile(r'{}(.*?){}'.format(re.escape(queLi[id]), re.escape(queLi[id+1])), re.DOTALL)
                anwserRes = re.findall(anw_pattern, textStr)
                if anwserRes:
                    anwserStr = anwserRes[0]
                    print('-----答案提取结果-----\n{}'.format(anwserStr))
                    anwLi.append(anwserStr)
                else:
                    print('第{}个未匹配上'.format(id + 1))

        if len(queLi) == len(anwLi) and len(queLi) != 0:
            print('提取成功!')
            for row in zip(queLi, anwLi):
                self.resultList.append([self.CODE, self.anwTime, self.anwTime, row[0].strip().replace('\n', ''), row[1].strip().replace('\n', '').replace('?', '').replace('{]', ''), self.file_name])
        else:
            print('该附件提取存在问题:{}'.format(self.file_name))
            self.errorList.append(self.file_name)

    def wtiteToTxt(self, path, resultList):
        with open(path, 'a+') as f:
            for row in resultList:
                f.write(row + '\n')

    def parseTable(self, tableList):
        try:
            timeStr = ''
            for index, row in enumerate(tableList):
                if not row:
                    continue
                if '投资者关系活动' and '内容介绍' in row.replace('\n', '').replace('\r', '').replace('\t', ''):
                    # print(index, row)
                    startIndex = index
                if '附件清单' in row.replace('\n', '').replace('\r', '').replace('\t', ''):
                    # print(index, row)
                    endIndex = index
                    break
                if '时间' in row.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '') and not timeStr:
                    timeStr = tableList[index + 1]
                    # print(timeStr)

            if timeStr:
                try:
                    pre_anw_time = re.findall(r'(\d+)年\s*(\d+)\s*月\s*(.*?)日', timeStr)
                    if len(pre_anw_time[0]) == 3:
                        pre_list = [row.split('-')[0] if '-' in row else row for row in pre_anw_time[0]]
                        self.anwTime = '-'.join(pre_list)

                except Exception as e:
                    print(e)
                    self.anwTime = ''
                    print('正则匹配日期异常')

            textBlock = tableList[startIndex + 1:endIndex]
            # print(textBlock)
            self.getQueAndAnw(textBlock)

        except Exception as e:
            print(e)
            print('该附件不满足条件,请检查')

    def parsePdf(self, path):
        '''
        函数入口
        遍历文件,将所有数据合并到一个list
        :param path:
        :return:
        '''
        tableList = []
        try:
            with pdfplumber.open(path) as f:
                for page in f.pages:
                    # page:每一页数据
                    for table in page.extract_tables():
                        # print(table)
                        # table:每一个表格
                        for li in table:
                            # print(li)
                            # 每一行数据
                            for row in li:
                                # preLi = row.split()
                                tableList.append(row)

            print(tableList)
            self.parseTable(tableList)
        except Exception as e:
            print(e)
            print('--------------该附件无法打开')
            self.errorList.append('无法打开' + self.file_name)

    def parseWord(self, path):
        tableList = []
        doc = Document(path)
        tbs = doc.tables
        for tb in tbs:
            for row in tb.rows:
                for cell in row.cells:
                    tableList.append(cell.text)
                    # print(cell.text)
                    # 自动编号判断标志
                    # num_flag =
        # print(tableList)
        self.parseTable(tableList)

    def writeToExcel(self, lastResultList, resultPath='./researchExcel.xlsx'):
        """Write the extracted rows to a new Excel workbook.

        The active sheet gets a header row followed by one row per entry of
        lastResultList; the workbook is saved once, after all rows are added.

        :param lastResultList: iterable of rows (each a list of cell values)
        :param resultPath: output file; defaults to './researchExcel.xlsx'
        :return: None
        """
        workbook = xl.Workbook()
        sheet = workbook.active
        # Header row.
        sheet.append(['CODE', '提问时间', '回答时间', '问题内容', '回答', '解析文件名'])
        for res in lastResultList:
            sheet.append(res)
        workbook.save(resultPath)

    def run(self, base_path):
        # base_path = r'D:\jhhuang.Irving\YUN\Study\File\PDF'
        for root, dirs, files in os.walk(base_path):
            # print(files)  # 当前路径下所有非目录子文件
            for file in files:
                path = os.path.join(base_path, file)
                self.CODE = file.split('-')[0]
                self.file_name = file
                print('-----------------当前解析文件-----------------', path)
                if path.endswith('PDF') or path.endswith('pdf'):
                    self.parsePdf(path)
                if path.endswith('DOC') or path.endswith('DOCX') or path.endswith('docx') or path.endswith('doc'):
                    self.fileFlag = 1
                    self.parseWord(path)
                # break
        self.writeToExcel(self.resultList)
        self.wtiteToTxt(r'./errorList.txt', self.errorList)


if __name__ == '__main__':
    # Sample PDF inputs used while developing the extraction:
    #   D:\jhhuang.Irving\YUN\Study\File\PDF\000001-平安银行-000001平安银行调研活动信息20210129(1).PDF
    #   D:\jhhuang.Irving\YUN\Study\File\PDF\000045-深纺织A-000045深纺织A调研活动信息20210406.PDF
    #   D:\jhhuang.Irving\YUN\Study\File\PDF\000050-深天马A-000050深天马A调研活动信息20200618.pdf
    #   D:\jhhuang.Irving\YUN\Study\File\PDF\000050-深天马A-2018年5月31日投资者关系活动记录表.PDF
    #   D:\jhhuang.Irving\YUN\Study\File\PDF\000050-深天马A-2018年11月6日投资者关系活动记录表.PDF
    # Known problem case - the first match result is overly long:
    #   D:\jhhuang.Irving\YUN\Study\File\PDF\000050-深天马A-2018年6月28日投资者关系活动记录表.PDF
    #
    # Directory holding the full PDF set:
    #   D:\jhhuang.Irving\YUN\Study\File\PDF
    parser = parseFile()
    # Process every file found under the test directory.
    parser.run(r'D:\jhhuang.Irving\YUN\Study\File\TEST')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值