提取表格
#%% Extract tables from the PDF
import tabula
# Read the tables from all pages of "17.pdf".  The result is iterated further
# down as a collection of DataFrames (presumably one per detected table --
# confirm against the installed tabula-py version).
df = tabula.read_pdf("17.pdf", pages='all')
表头处理
import pandas as pd
# Per-table frames whose column labels have been turned back into a data row.
df_result = []
for df_1 in df:
    # The column labels of each extracted table are kept as data: they become
    # a one-row frame that is stacked on top of the table body.
    # NOTE(review): presumably needed because tabula promotes the first line
    # of every table to a header -- confirm against the PDF.
    column = df_1.columns.values.tolist()
    df_c = pd.DataFrame(column).T  # header labels as a single row
    # Give both frames identical labels so they align when stacked.  This
    # requires every table to have exactly five columns.
    df_c.columns = list('ABCDE')
    df_1.columns = list('ABCDE')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement.  ignore_index=True gives the same fresh 0..n-1 index that
    # the former reset_index(drop=True) call produced.
    df_r = pd.concat([df_c, df_1], ignore_index=True)
    df_result.append(df_r)
多个表头相同df合并为一个
#%% Merge
# Stack all per-table frames (they share the same column labels) into one
# frame and renumber its rows 0..n-1.
df_z = pd.concat(df_result).reset_index(drop=True)
# Work on an independent deep copy in which missing cells are empty strings,
# so that cells can be compared with '' and concatenated with +.
df_t = df_z.copy(deep=True).fillna('')
单元格合并(以 C 列是否为空判断续行,把续行的 A、E 列内容并入上一行):
1、如果该行 C 列为空,且上、下两行的 C 列不为空,则上一行内容 = 上一行 + 该行;
2、如果该行和下一行的 C 列都为空,且上一行和下下行的 C 列不为空,则上一行内容 = 上一行 + 该行 + 下一行;
3、以此类推:连续多行 C 列为空时,依次把这些行并入上一行。
# Merge continuation rows into the row above them.
#
# A row whose 'C' cell is empty, directly below a row whose 'C' cell is not
# empty, starts a run of continuation rows.  The 'E' and 'A' cells of every
# row in that run are appended, in order, to the row above the run.  The
# continuation rows themselves are left in place.
#
# Compared with the earlier hand-unrolled chain of one-to-five-row cases:
#   * runs of any length are merged, not just runs of up to five rows;
#   * a run that reaches the last row no longer reads past the end of the
#     frame (this used to raise KeyError);
#   * cells are written with .at instead of chained indexing, which does not
#     reliably update the frame and never does under copy-on-write.
#
# Assumes df_t has a default 0..n-1 index, so labels equal positions.
row_count = len(df_t)
index = 1  # row 0 has no row above it to merge into
while index < row_count:
    starts_run = df_t.at[index, 'C'] == '' and df_t.at[index - 1, 'C'] != ''
    if not starts_run:
        index += 1
        continue

    # Find the exclusive end of the run of rows with an empty 'C' cell.
    run_end = index
    while run_end < row_count and df_t.at[run_end, 'C'] == '':
        run_end += 1

    for col in ('E', 'A'):
        merged = df_t.at[index - 1, col]
        for run_row in range(index, run_end):
            # Plain + keeps the left-to-right concatenation of the old code.
            merged = merged + df_t.at[run_row, col]
        df_t.at[index - 1, col] = merged

    # Skip the rows just merged; the row at run_end (if any) has a non-empty
    # 'C' cell and is examined on the next pass.
    index = run_end
删除空值,多余的值
#%% Remove the rows whose 'C' cell is empty
# Build the cleaned result in df_r2 and leave df_t untouched.
df_r2 = df_t.copy(deep=True)
# DataFrame.drop returns a new frame rather than modifying df_r2, so the
# result has to be assigned back for the rows to actually be removed.
# The surviving rows keep their original index labels.
df_r2 = df_r2.drop(index=df_r2.index[df_r2['C'] == ''])