PDF Parsing

import re

import nltk
import pdfplumber
from PyPDF2 import PdfFileReader
from txtai.pipeline import Textractor


class PDFanalysis:
    def __init__(self, path):
        self.pdf_path = path
        # Parse the whole document once and keep the cleaned sentence list.
        self.pdf_data = self.pypdf2_pdf_run(self.pdf_path)

    def ppr_analysis_pdf(self, path):
        """Extract all tables from the PDF with pdfplumber."""
        tables = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    tables.append(table)
        return tables

    def txtai_analysis_pdf(self, path):
        """Extract sentences from the PDF with txtai's Textractor pipeline."""
        textractor = Textractor(sentences=True)
        data = textractor(path)
        return data

    def pypdf2_analysis_paf_show(self, path, number, output):
        """Debug helper: print a single page in one of several views selected by `output`."""
        datas = []
        sents = []
        reader = PdfFileReader(path)
        page = reader.getPage(number)
        page_text = page.extractText()
        if output == 0:
            print("Full PDF processing .....", "\n")
            page_split = page_text.split("\n")
            new_list = []
            for index, sent1 in enumerate(page_split[:-1]):
                sent2 = page_split[index + 1]
                # Do not join a line with the next one if the next line starts a
                # figure/table/page caption, or if this line is a correspondence block.
                FTPwords = re.findall(r"^Fig.|^Table.|^Page ", sent2, re.IGNORECASE)
                Cwords = re.findall(r"[* ]Correspondence:", sent1, re.IGNORECASE)
                if len(sent1) >= 45 and len(sent1) + len(sent2) >= 100 and not FTPwords and not Cwords:
                    new_list.append(sent1)
                else:
                    new_list.append(sent1 + "\n")
            new_list.append(page_split[-1])
            text = "".join(new_list)
            text = self.pre_split_sign_text(text)
            print(text)
        elif output == 1:
            print("Preprocessed PDF, line lengths + parse ......", "\n")
            sign_text = self.pre_split_sign_text(page_text)
            page_split = sign_text.split("\n")
            for sent in page_split:
                sents.append((len(sent), sent))
            for ln, sent in sents:
                print("len:", ln)
                print("sent:", sent)
        elif output == 2:
            datas.append(page_text)
            print("Raw PDF parse ......", "\n")
            # print(page_text)
            print(datas)
        elif output == 3:
            print("Preprocessing PDF ...", "\n")
            sign_text = self.pre_split_sign_text(page_text)
            print(sign_text)

    def pre_split_sign_text(self, text):
        """Repair line breaks that PyPDF2 inserts inside words, units and punctuation."""
        text = self.pre_split_str_text(text, r"[(][\w]+\n", "\n", " ")  # special words
        text = self.pre_split_str_text(text, r"[( ][\w]+\n\d\w\d", "\n", "")
        text = self.pre_split_str_text(text, r"[( ][\w]+\n\d[+-]", "\n", "")
        text = self.pre_split_str_text(text, r"[( ]C\nmax[ ]|[( ]C\nmin[ ]", "\n", "")
        text = self.pre_split_str_text(text, r"[( ]DL\nCO[ ]|[( ]CL\nCR[ ]", "\n", "")
        sents = self.pre_split_str_text(text, r"\xa0 cm\n[−+]\d\s", "\n", "")
        # e.g. "...BMC Cancer, 13, 326. [PubMed: 23819905] \nAndres SA &..."
        sents = sents.replace("] \n", "]\n")
        sents = sents.replace(" \x0b", " ")
        # soft hyphen (\xad) before a line break
        connect_sents = sents.split("\xad\n")
        sents = "".join(connect_sents)
        # whitespace around line breaks
        sents = sents.replace(" \n", "\n")
        connect_words = sents.split(" \n")
        sents = " ".join(connect_words)
        # parentheses split across lines
        connect_sents = sents.split("(\n")
        sents = "(".join(connect_sents)
        connect_sents = sents.split(")\n")
        sents = ")".join(connect_sents)
        # special symbols
        connect_sents = sents.split("\n™")
        sents = "™".join(connect_sents)
        connect_sents = sents.split("\n®")
        sents = "®".join(connect_sents)
        # hyphens
        sents = sents.replace("--\n", "~~\n")
        relu = re.compile(r"\n[-]\n|[-]\n| [-]\n")
        rlist = relu.split(sents)
        sents = "".join(rlist)
        # ,
        relu = re.compile(r"[,]\n|\n[,]")
        rlist = relu.split(sents)
        sents = ",".join(rlist)
        # ;
        relu = re.compile(r"[;]\n|\n[;]")
        rlist = relu.split(sents)
        sents = ";".join(rlist)
        # :
        relu = re.compile(r"[:]\n|\n[:]")
        rlist = relu.split(sents)
        sents = ":".join(rlist)
        # /
        relu = re.compile(r"[/]\n|\n[/]")
        rlist = relu.split(sents)
        sents = "/".join(rlist)
        # keep sentence-final breaks, pad the remaining ones
        sents = sents.replace(".\n", ".\n ")
        sents = sents.replace("\n", " \n")
        # keywords / section markers
        sents = sents.replace(" NSCLC\npatient", " NSCLC patient")
        sents = sents.replace("AbstractBackground: ", "Abstract\nBackground: ")
        sents = sents.replace("© The Author(s).", "\n© The Author(s).")
        sents = sents.replace("* Correspondence:", "\n* Correspondence:")
        sents = sents.replace("*Correspondence", "\n*Correspondence")
        sents = sents.replace("INTRODUCTION", "\nINTRODUCTION\n")
        sents = sents.replace(".DATA SHARING", ".\nDATA SHARING\n")
        sents = sents.replace(".REFERENCES", ".\nREFERENCES\n")
        return sents

    def pre_split_str_text(self, text, rerelu, invalue, outvalue):
        """Inside every match of `rerelu`, replace `invalue` with `outvalue`; keep the rest."""
        relu = re.compile(rerelu)
        words = relu.findall(text)
        sents = relu.split(text)
        if words:
            word_list = []
            for word in words:
                word_list.append(str(word).replace(invalue, outvalue))
            text_list = []
            for index, sent in enumerate(sents[:-1]):
                text_list.append(sent + word_list[index])
            retext = "".join(text_list)
            return retext
        else:
            return text

    def split_paras(self, text):
        return [i for i in re.split(r"[\n]", text)]

    def split_long_sents_re(self, text):
        relu = re.compile(r"[.。??!!]")
        words = relu.findall(text)
        sents = relu.split(text)
        text_list = []
        for index, sent in enumerate(sents[:-1]):
            text_list.append(sent + words[index])
        return text_list

    def split_long_sents_nltk(self, text):
        sent = nltk.tokenize.sent_tokenize(text)
        return sent

    def clear_read_data(self, data):
        """Split the cleaned text into paragraphs, then into sentences."""
        data_list = self.split_paras(data)
        sents_list = []
        for i in data_list:
            # count full stops; a paragraph with at most one is kept whole
            relu = re.compile(r"\.")
            words = relu.findall(i)
            if len(words) <= 1:
                sents_list.append(i)
            else:
                # cache_list = self.split_long_sents_re(i)
                cache_list = self.split_long_sents_nltk(i)
                sents_list.extend(cache_list)
        return sents_list

    def pypdf2_pdf_run(self, path, is_join=False):
        """Extract every page with PyPDF2, repair line breaks and return a sentence list."""
        reader = PdfFileReader(path)
        page_count = reader.getNumPages()
        texts = []
        for i in range(page_count):
            page = reader.getPage(i)
            page_text = page.extractText()
            page_split = page_text.split("\n")
            new_list = []
            for index, sent1 in enumerate(page_split[:-1]):
                if is_join:
                    sent2 = page_split[index + 1]
                    FTPwords = re.findall(r"^Fig.|^Table.|^Page ", sent2, re.IGNORECASE)
                    Cwords = re.findall(r"[* ]Correspondence:", sent1, re.IGNORECASE)
                    if len(sent1) >= 45 and len(sent1) + len(sent2) >= 100 and not FTPwords and not Cwords:
                        new_list.append(sent1)
                    else:
                        new_list.append(sent1 + "\n")
                else:
                    new_list.append(sent1)
            new_list.append(page_split[-1] + "\n")
            text = "".join(new_list)
            text = self.pre_split_sign_text(text)
            texts.append(text)
        datas = "".join(texts)
        pdf_data = self.clear_read_data(datas)
        return pdf_data


if __name__ == '__main__':
    path = "../../datasets/pdf/A001.pdf"
    model = PDFanalysis(path)
    print(model.pdf_data)
    """
    Tested on 36 papers:
    8 papers still show the long-sentence ("\n") problem;
    24 papers are fully debugged; for layout reasons a few titles are merged
    with paragraphs, and the parsing order is inconsistent.
    """
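Note that the script above relies on the legacy PyPDF2 1.x API (PdfFileReader, getNumPages, getPage, extractText), which was removed in PyPDF2 3.0. As a rough sketch only, the equivalent calls under the newer API would be:

# PyPDF2 3.x equivalents of the legacy calls used above (sketch; same sample path assumed).
from PyPDF2 import PdfReader

reader = PdfReader("../../datasets/pdf/A001.pdf")
page_count = len(reader.pages)                     # replaces reader.getNumPages()
first_page_text = reader.pages[0].extract_text()   # replaces reader.getPage(0).extractText()
print(page_count, first_page_text[:200])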
The code implements a class named PDFanalysis for processing PDF files. It provides several methods: extracting tables and text with the pdfplumber and PyPDF2 libraries, extracting sentences with txtai, and preprocessing the extracted text, including handling special symbols and repairing line breaks. It also includes helper methods for splitting long sentences and cleaning the parsed data.
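For reference, a minimal usage sketch of the class follows. It assumes a legacy PyPDF2 release that still provides PdfFileReader, pdfplumber, txtai and nltk installed, the NLTK punkt tokenizer data downloaded, and that the sample path from the script exists; these assumptions go beyond what the original post states.

# Minimal usage sketch for PDFanalysis (assumptions: legacy PyPDF2, pdfplumber,
# txtai and nltk installed; the sample PDF path below exists).
import nltk
nltk.download("punkt")  # tokenizer data needed by split_long_sents_nltk

path = "../../datasets/pdf/A001.pdf"
model = PDFanalysis(path)

# Cleaned sentence list built in __init__ by pypdf2_pdf_run.
for sent in model.pdf_data[:10]:
    print(sent)

# Table extraction via pdfplumber and sentence extraction via txtai.
tables = model.ppr_analysis_pdf(path)
sentences = model.txtai_analysis_pdf(path)
print(len(tables), len(sentences))

# Per-page debug view: page 0, mode 1 prints every parsed line with its length.
model.pypdf2_analysis_paf_show(path, 0, 1)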