"""Extract question/answer pairs from investor-relations activity records.

Every PDF / Word file under a directory is flattened into a list of table
cell texts, the Q&A section of the record is located, questions are found
with a list of regular expressions and the text between two consecutive
questions is taken as the answer.  Results are written to an Excel workbook
and the names of files that could not be processed to a text file.
"""
import os
import re
import sys

import pandas as pd

# The third-party parsers are only needed by the method that uses them, so the
# text-extraction logic stays importable when one of them is not installed.
try:
    import openpyxl as xl
except ImportError:  # only required by writeToExcel
    xl = None
try:
    import pdfplumber
except ImportError:  # only required by parsePdf
    pdfplumber = None
try:
    from docx import Document
except ImportError:  # only required by parseWord
    Document = None


class parseFile():
    """Collect question/answer rows from investor-relations record files.

    Attributes:
        chapter_regex: patterns tried in order to find questions in PDF text.
        doc_regex: patterns used instead when ``fileFlag`` is truthy (Word).
        fileFlag: 0 while a PDF is being parsed, 1 while a Word file is.
        CODE: prefix of the current file name up to the first ``-``.
        anwTime: date found in the current record as ``year-month-day``,
            or ``''`` when none was found.
        file_name: name of the file currently being parsed.
        resultList: extracted rows
            ``[CODE, anwTime, anwTime, question, answer, file_name]``.
        errorList: names of files that could not be processed.
    """

    def __init__(self):
        self.chapter_regex = [
            r'\s*\d+\.\s+(.*?)?\s+',
            r'\s+\d+:(.*?)?\s+',
            r'\s+\d+、(.*?)答:',
            r'\s*[一二三四五六七八九十]+、(.*?)\s+',
            r'\s*问:\s*(.*?)答:',
            r'\s*问题\d+:\s*(.*?)回复:',
            r'\s*\d+、\s*(.*?)\s+'
        ]
        # NOTE(review): the lazy, optional group can only ever capture an
        # empty string, so no usable question is extracted from Word files
        # with this pattern -- confirm the intended expression.
        self.doc_regex = [
            r'\s+(.*?)?'
        ]
        self.fileFlag = 0
        self.CODE = ''
        self.anwTime = ''
        self.file_name = ''
        self.resultList = []
        self.errorList = []

    def getRegexRes(self, preList):
        """Repeatedly apply ``chapter_regex`` to the captured strings.

        Every string in ``preList`` is searched with every chapter pattern
        and the matches are printed.  The matches of the last pattern on the
        last string are then searched again, until nothing matches.

        :param preList: list of strings to search
        :return: the (empty) match list of the final round
        """
        preLi = []
        for preres in preList:
            for regex_str in self.chapter_regex:
                preLi = re.findall(regex_str, preres, re.DOTALL)
                if not preLi:
                    continue
                print('----------------正则匹配结果:')
                print(preLi)
                for number, match in enumerate(preLi, start=1):
                    print(number, match)
        if not preLi:
            return preLi
        # The original dropped this result and implicitly returned None.
        return self.getRegexRes(preLi)

    def getIndex(self, preLi, preStr):
        """Return the index of the first row containing ``preStr``.

        :param preLi: list of strings to scan
        :param preStr: substring to look for
        :return: the index, or None when no row contains ``preStr``
        """
        for index, row in enumerate(preLi):
            if preStr in row:
                return index
        return None

    def getQueAndAnw(self, textBlock):
        """Extract questions and answers from the cells of the Q&A section.

        The cells are joined with tabs.  The first pattern (``doc_regex``
        when ``fileFlag`` is set, ``chapter_regex`` otherwise) that matches
        provides the questions; an answer is the text between a question and
        the next one, or everything after the last question.  When every
        question received an answer the rows are appended to
        ``resultList``, otherwise ``file_name`` is appended to ``errorList``.

        :param textBlock: list of cell texts; None cells count as empty
        """
        print(textBlock)
        # pdfplumber yields None for empty cells, which str.join rejects.
        textStr = '\t'.join(cell or '' for cell in textBlock)
        print(textStr)
        regexList = self.doc_regex if self.fileFlag else self.chapter_regex

        queLi = []
        for regex_str in regexList:
            candidates = re.findall(regex_str, textStr, re.DOTALL)
            if not candidates:
                continue
            for row in candidates:
                if '提问及回复' in row:
                    row = row.split('提问及回复')[1]
                # The row is split on a double tab, so test for a double tab;
                # testing for a single one raised IndexError below.
                if '\t\t' in row:
                    parts = row.split('\t\t')
                    if '?' not in parts[1]:
                        row = parts[0]
                queLi.append(row)
            # Only the first pattern that matches anything is used.
            break

        anwLi = []
        last = len(queLi) - 1
        for position, question in enumerate(queLi):
            print('-----第{}个匹配结果-----'.format(position + 1))
            print(question)
            if position == last:
                if not question:
                    # str.split('') raises ValueError; report it as a miss.
                    print('第{}个未匹配上'.format(position + 1))
                    continue
                anwserRes = textStr.split(question)[1]
                print('-----答案提取结果-----\n{}'.format(anwserRes))
                anwLi.append(anwserRes)
                continue
            anw_pattern = re.compile(
                r'{}(.*?){}'.format(re.escape(question),
                                    re.escape(queLi[position + 1])),
                re.DOTALL)
            found = anw_pattern.findall(textStr)
            if found:
                print('-----答案提取结果-----\n{}'.format(found[0]))
                anwLi.append(found[0])
            else:
                print('第{}个未匹配上'.format(position + 1))

        if queLi and len(queLi) == len(anwLi):
            print('提取成功!')
            for question, answer in zip(queLi, anwLi):
                self.resultList.append([
                    self.CODE,
                    self.anwTime,
                    self.anwTime,
                    question.strip().replace('\n', ''),
                    answer.strip().replace('\n', '').replace('?', '').replace('{]', ''),
                    self.file_name,
                ])
        else:
            print('该附件提取存在问题:{}'.format(self.file_name))
            self.errorList.append(self.file_name)

    def writeToTxt(self, path, resultList):
        """Append every string of ``resultList`` to ``path``, one per line.

        :param path: text file to append to (written as UTF-8)
        :param resultList: list of strings
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.writelines(row + '\n' for row in resultList)

    # Original (misspelled) name, kept so existing callers keep working.
    wtiteToTxt = writeToTxt

    def parseTable(self, tableList):
        """Locate the date and the Q&A section in a flat list of cells.

        The Q&A section is everything between the cell containing both
        '投资者关系活动' and '内容介绍' and the cell containing '附件清单'.
        The cell following the first cell that contains '时间' is parsed as
        the date.  Failures are printed and recorded in ``errorList``.

        :param tableList: cell texts in reading order; entries may be None
        """
        # Do not carry the date of the previously parsed file over.
        self.anwTime = ''
        try:
            timeStr = ''
            startIndex = None
            endIndex = None
            for index, row in enumerate(tableList):
                if not row:
                    continue
                compact = row.replace('\n', '').replace('\r', '').replace('\t', '')
                # `'a' and 'b' in s` only tests 'b'; require both markers.
                if '投资者关系活动' in compact and '内容介绍' in compact:
                    startIndex = index
                if '附件清单' in compact:
                    endIndex = index
                    break
                if ('时间' in compact.replace(' ', '') and not timeStr
                        and index + 1 < len(tableList)):
                    timeStr = tableList[index + 1] or ''

            if timeStr:
                found = re.search(r'(\d+)年\s*(\d+)\s*月\s*(.*?)日', timeStr)
                if found:
                    # For a range such as "28-29" keep the first day only.
                    self.anwTime = '-'.join(
                        part.split('-')[0].strip() for part in found.groups())
                else:
                    print('正则匹配日期异常')

            if startIndex is None or endIndex is None:
                raise ValueError('Q&A section markers not found')
            self.getQueAndAnw(tableList[startIndex + 1:endIndex])
        except Exception as e:
            # Per-file boundary: report, remember the file and carry on.
            print(e)
            print('该附件不满足条件,请检查')
            self.errorList.append(self.file_name)

    def parsePdf(self, path):
        """Flatten every table cell of a PDF into one list and parse it.

        :param path: path of the PDF file
        :raises ImportError: when pdfplumber is not installed
        """
        if pdfplumber is None:
            raise ImportError('pdfplumber is required to parse PDF files')
        tableList = []
        try:
            with pdfplumber.open(path) as f:
                for page in f.pages:
                    for table in page.extract_tables():
                        for line in table:
                            tableList.extend(line)
        except Exception as e:
            print(e)
            print('--------------该附件无法打开')
            self.errorList.append('无法打开' + self.file_name)
            return
        print(tableList)
        self.parseTable(tableList)

    def parseWord(self, path):
        """Flatten every table cell of a Word document and parse it.

        A file that cannot be opened is recorded in ``errorList`` instead of
        aborting the whole run.

        :param path: path of the Word file
        :raises ImportError: when python-docx is not installed
        """
        if Document is None:
            raise ImportError('python-docx is required to parse Word files')
        try:
            doc = Document(path)
            tableList = [cell.text
                         for tb in doc.tables
                         for row in tb.rows
                         for cell in row.cells]
        except Exception as e:
            print(e)
            print('--------------该附件无法打开')
            self.errorList.append('无法打开' + self.file_name)
            return
        self.parseTable(tableList)

    def writeToExcel(self, lastResultList):
        """Write a header row and all result rows to ./researchExcel.xlsx.

        :param lastResultList: rows as stored in ``resultList``
        :raises ImportError: when openpyxl is not installed
        """
        if xl is None:
            raise ImportError('openpyxl is required to write the result file')
        resultPath = './researchExcel.xlsx'
        workbook = xl.Workbook()
        sheet = workbook.active
        # Header row.
        sheet.append(['CODE', u'提问时间', u'回答时间', u'问题内容', u'回答', u'解析文件名'])
        for res in lastResultList:
            sheet.append(res)
        workbook.save(resultPath)

    def run(self, base_path):
        """Parse every PDF / Word file below ``base_path`` and save results.

        :param base_path: directory that is walked recursively
        """
        for root, _dirs, files in os.walk(base_path):
            for file in files:
                # Join with `root`, not `base_path`, so that files inside
                # sub-directories resolve to an existing path.
                path = os.path.join(root, file)
                self.CODE = file.split('-')[0]
                self.file_name = file
                print('-----------------当前解析文件-----------------', path)
                suffix = os.path.splitext(file)[1].lower()
                if suffix == '.pdf':
                    # Reset the flag a preceding Word file may have set.
                    self.fileFlag = 0
                    self.parsePdf(path)
                elif suffix in ('.doc', '.docx'):
                    self.fileFlag = 1
                    self.parseWord(path)
        self.writeToExcel(self.resultList)
        self.writeToTxt(r'./errorList.txt', self.errorList)


if __name__ == '__main__':
    # Usage: python <script> [directory]; without an argument the original
    # development directory is used.
    t = parseFile()
    t.run(sys.argv[1] if len(sys.argv) > 1
          else r'D:\jhhuang.Irving\YUN\Study\File\TEST')
# PDF解析
# 最新推荐文章于 2024-02-28 19:54:04 发布