Python 表格合并处理

 提取表格

#%% Extract the tables from the PDF
import tabula

# Read every page of 17.pdf. The rest of the script iterates over `df` and
# treats each element as one table (a DataFrame with its own column labels).
df = tabula.read_pdf(
    "17.pdf",
    pages="all",
)

表头处理

import pandas as pd

# For every extracted table, turn its column labels back into a data row
# (row 0) and relabel the columns positionally as A..E, so that all tables
# share the same labels and can be stacked into one frame.
# NOTE(review): this requires every table to have exactly five columns;
# assigning the five labels raises ValueError otherwise.
df_result = []
for df_1 in df:
    column = df_1.columns.values.tolist()
    df_c = pd.DataFrame(column).T  # the header labels as a one-row frame
    df_c.columns = list('ABCDE')
    df_1.columns = list('ABCDE')  # relabels the extracted table in place
    # DataFrame.append was removed in pandas 2.0; pd.concat is its
    # replacement. ignore_index=True renumbers the rows 0..n-1, which is what
    # the former reset_index(drop=True) call did.
    df_r = pd.concat([df_c, df_1], ignore_index=True)

    df_result.append(df_r)

将多个表头相同的 df 合并为一个

 #%% Merge the per-table frames
# Stack every table vertically and renumber the rows 0..n-1 in one step.
df_z = pd.concat(df_result, ignore_index=True)
# Work on an independent copy in which missing cells are replaced by '' so
# that they can be compared against the empty string.
df_t = df_z.copy(deep=True).fillna('')

 单元格合并(以 C 列是否为空来判断续行,被合并的是 A 列和 E 列的内容):

1、如果该单元格为空,且上、下单元格均不为空,则上单元格内容=上单元格+该单元格;

2、如果该单元格和下一个单元格均为空,且上单元格和下下单元格均不为空,则上单元格内容=上单元格+该单元格+下单元格;

3、以此类推(连续三个、四个……空单元格时同理)。

# Merge continuation rows into the row above them.
#
# A row whose column C is '' is treated as a continuation of the closest
# preceding row whose column C is not ''. For every run of such rows, the A
# and E cells of the run are appended (with `+`, top to bottom) to the A and E
# cells of the row directly above the run. The continuation rows themselves
# are left in place. A run that starts at row 0 has no row above it and is
# left untouched.
#
# This relies on df_t having the default 0..n-1 row labels (df_z is re-indexed
# with reset_index before df_t is copied from it), so label k is also
# position k.
is_continuation = (df_t['C'] == '').tolist()
row_count = len(df_t)

for index in range(1, row_count):
    # Act only on the first row of a run: C is empty here but not above.
    if not is_continuation[index] or is_continuation[index - 1]:
        continue

    # Find the end of the run without reading past the last row; a run that
    # reaches the bottom of the frame is merged as well. Runs of any length
    # are handled, not just one to five rows.
    run_end = index
    while run_end < row_count and is_continuation[run_end]:
        run_end += 1

    for col in ('E', 'A'):
        merged = df_t.at[index - 1, col]
        for k in range(index, run_end):
            merged = merged + df_t.at[k, col]
        # Write with a single .at access: chained assignment such as
        # df_t[col][i] = value does not update df_t under Copy-on-Write.
        df_t.at[index - 1, col] = merged

 删除空值,多余的值

#%% Drop the rows whose column C is empty
df_r2 = df_t.copy(deep=True)
# DataFrame.drop returns a new frame and leaves its receiver unchanged, so the
# result has to be assigned back; otherwise df_r2 would keep every row.
df_r2 = df_r2.drop(index=df_r2.loc[df_r2['C'] == ''].index)

 

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值