# 以前做过的一个项目,需要从pdf格式的检查报告提取关键信息到Excel中
import numpy as np
import pandas as pd
import re
import pdfplumber
import os
from tqdm import tqdm
# Show every row and column when a DataFrame is printed (None = no limit).
# The fully-qualified option names are required: pandas resolves option keys
# by regex, and the short forms 'max_row' / 'max_columns' also match the
# 'styler.render.*' options, which makes set_option raise OptionError.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Discover every PDF report below the input directory.
def collect_pdf_files(base_dir):
    """Return ``(paths, ids)`` for every PDF file found under *base_dir*.

    The directory tree is walked recursively.  A file counts as a PDF when
    its name ends in ``.pdf``, compared case-insensitively.  ``paths`` holds
    the full path of each match and ``ids`` holds the file name with that
    extension removed; the two lists are parallel (same length, same order).
    A directory that does not exist yields two empty lists.
    """
    suffix = '.pdf'
    paths = []
    ids = []
    for root, _dirs, files in os.walk(base_dir):
        for file1 in files:
            if not file1.lower().endswith(suffix):
                continue
            # Slice the extension off instead of using a regex: the previous
            # pattern '(.+?).[pdf,PDF]' cut the ID at the first 'p', 'd', 'f'
            # or ',' in the name (e.g. 'report.pdf' became 'r').
            ids.append(file1[:-len(suffix)])
            paths.append(os.path.join(root, file1))
    return paths, ids


# Not read by any code visible in this script; kept in case later code uses it.
a = 0
# xuejian_list: full paths of the PDFs; xuejian_id_list: matching report IDs.
xuejian_list, xuejian_id_list = collect_pdf_files(r'C:\Users\XX')
num = len(xuejian_list)
# Patterns for the two diagnosis strings on the first page.  They are raw
# strings so that '\s' and '\。' reach the regex engine without relying on
# Python's invalid-escape fallback; the regex semantics are unchanged.
# NOTE(review): the separator classes list ':' more than once; possibly
# full-width colon variants were intended - confirm against real reports.
_RESULT1_PRIMARY = re.compile(r'结果[:,:,:,\s]{1,}(.+?)[\。]{0,}[\n,\s]')
_RESULT1_FALLBACK = re.compile(r'结果[:,:,\s]{0,}(.+?)\。')
_RESULT2_UNTIL_DOCTOR = re.compile(
    r'(本次[\s\S]+?)[\n,\s]{0,}主[\s]{0,}治[\s]{0,}医[\s]{0,}生')
_RESULT2_UNTIL_EXAMINER = re.compile(
    r'(本次[\s\S]+?)[\n,\s]{0,}检[\s]{0,}验[\s]{0,}者')
_UP_TO_LAST_STOP = re.compile(r'(.+\。)')
_WHITESPACE = re.compile(r'\s+')


def _lookup_report_id(file):
    """Return the report ID for *file*: the module-level list entry, else the file stem."""
    try:
        return xuejian_id_list[xuejian_list.index(file)]
    except (NameError, ValueError, IndexError):
        return os.path.splitext(os.path.basename(file))[0]


def _extract_result1(text):
    """Return the text captured after '结果' on the first page (diagnosis result 1)."""
    match = _RESULT1_PRIMARY.search(text) or _RESULT1_FALLBACK.search(text)
    if match is None:
        raise ValueError("no '结果' field found in the first page text")
    return match.group(1)


def _extract_result2(text):
    """Return the '本次...' paragraph of the first page with all whitespace removed."""
    match = _RESULT2_UNTIL_DOCTOR.search(text)
    if match is not None:
        return _WHITESPACE.sub('', match.group(1))
    match = _RESULT2_UNTIL_EXAMINER.search(text)
    if match is None:
        raise ValueError("no '本次...' paragraph found in the first page text")
    compact = _WHITESPACE.sub('', match.group(1))
    # In this variant the paragraph is additionally cut after its last '。'.
    sentence = _UP_TO_LAST_STOP.search(compact)
    if sentence is None:
        raise ValueError("the '本次...' paragraph contains no '。'")
    return sentence.group(1)


def pdf_to_excel(file, file_id=None):
    """Extract the key fields of one PDF report into a single-row DataFrame.

    The text of the first page supplies two diagnosis strings (``result1``
    and ``result2``).  The tables of all pages are stacked, and the values of
    their ``检测结果`` column become one row whose column labels are the
    values of the ``英文缩写`` column.

    Args:
        file: Path of the PDF file to read.
        file_id: Value stored in the ``id`` column.  When omitted, the ID is
            looked up in the module-level ``xuejian_list`` /
            ``xuejian_id_list`` pair; if the file is not listed there, the
            file name without its extension is used.  This replaces the
            former read of the caller's loop index ``x`` as a global.

    Returns:
        A one-row DataFrame with the columns ``id``, ``result1``,
        ``result2`` followed by one column per ``英文缩写`` value.

    Raises:
        ValueError: If the PDF has no pages, no tables, or a diagnosis
            string cannot be located in the first page's text.
        KeyError: If the stacked tables lack ``检测结果`` or ``英文缩写``.
    """
    if file_id is None:
        file_id = _lookup_report_id(file)
    dist = {'id': file_id}
    tables = []
    # The context manager closes the PDF even when extraction raises.
    with pdfplumber.open(file) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            if page_no == 1:
                # Only the first page is searched for the diagnosis text.
                pdf_text = page.extract_text() or ''
                dist['result1'] = _extract_result1(pdf_text)
                dist['result2'] = _extract_result2(pdf_text)
            for raw_table in page.extract_tables():
                if not raw_table:
                    continue
                # The first row of each extracted table is its header.
                tables.append(pd.DataFrame(raw_table[1:], columns=raw_table[0]))
    if 'result1' not in dist:
        raise ValueError('PDF contains no pages: {}'.format(file))
    if not tables:
        raise ValueError('PDF contains no tables: {}'.format(file))

    dist_new = pd.DataFrame([dist])
    pdf_table = pd.concat(tables, axis=0).reset_index(drop=True)
    # One column of results indexed by abbreviation, transposed below so the
    # abbreviations become column labels of a single row.
    pdf_table1 = pd.DataFrame(pdf_table['检测结果'].values)
    pdf_table1.index = pd.Index(pdf_table['英文缩写'].values)
    # Merge the diagnosis fields and the table values side by side.
    pdf_data = pd.concat([dist_new, pdf_table1.T], axis=1)
    return pdf_data
# Run the extraction over every collected PDF, stack the per-file rows into
# one table and write it out, together with the list of files that failed.
xuejian_path = pd.DataFrame(None)  # rows of all files extracted so far
false_file = []                    # paths of the files that raised
# The index stays bound to the module-level name `x` because code outside
# this loop may look the current file up as xuejian_id_list[x].
for x, pdf_path in enumerate(tqdm(xuejian_list)):
    try:
        pdf_data = pdf_to_excel(pdf_path)
        # Concatenate inside the try block: a file whose columns cannot be
        # merged (e.g. duplicated labels) is reported as failed on its own
        # instead of aborting the whole run.
        if not xuejian_path.empty:
            pdf_data = pd.concat([xuejian_path, pdf_data], axis=0)
    except Exception:
        # Best effort: remember the file and carry on.  `Exception` rather
        # than a bare except, so Ctrl-C still stops the run.
        false_file.append(pdf_path)
    else:
        # Commit the merged table only when the whole step succeeded.
        xuejian_path = pdf_data
# Build the output from the accumulator, not from `pdf_data`: after a failed
# final iteration `pdf_data` may hold only a single file's row, and it is
# undefined when no file succeeded at all.
pdf_data1 = xuejian_path.reset_index(drop=True)
pdf_data1.to_csv(r'xx.csv',encoding="gbk",index=False)
false_file1 = pd.DataFrame(false_file,columns=['file_name'])
false_file1.to_csv(r'xx',encoding='gbk',index=False)
print("总pdf文件数:{}".format(len(xuejian_list)))
print("已提取的pdf文件数:{}".format(len(pdf_data1)))