提取PDF文档中的表格转换成Excel表格【已更新】

本文链接：https://blog.csdn.net/Zlb0999_/article/details/143222972

目录

前言

一、准备工作

1、安装相关库文件 2、事先准备一个包含类似以下格式表格的PDF文档，并保存在相应目录中,测试用的PDF文档下载：测试用的PDF文档该示例文档只包含了两个格式相同的表格。

二、设计步骤

1.引入库

2.设计思路

总结

前言

有一个朋友是做跨境电子商务的，开了几个商务网店，每天早上上班的时候，都要求员工从各个商店下载大量商品记录，并进行汇总分析，但下载的都是包含表格的PDF格式，只能手工摘录PDF表格数据保存到一个Excel，这样做费时费力又容易出错，所以精心了个小程序，可以批量提取PDF文档的表格汇总成Excel表格，便于统计、分析。

一、准备工作

1、安装相关库文件

# 需要以下三个库
pip install pdfplumber
pip install openpyxl
pip install pandas

2、事先准备一个包含类似以下格式表格的PDF文档，并保存在相应目录中,测试用的PDF文档下载：测试用的PDF文档该示例文档只包含了两个格式相同的表格。

二、设计步骤

1.引入库

import pdfplumber,openpyxl,os
import pandas as pd
import numpy as np
from openpyxl.styles import Alignment,Color,Border,Side # 导入字体和颜色模块,填充模块
from openpyxl.utils import get_column_letter

2.设计思路

（1）预设单元格居中及边框线条粗细，备用。

alig_center = Alignment(horizontal='center', vertical='center')  # 居中
border = Border(                                                 # 设置表格的边框
    left=Side(border_style='thin', color=Color(rgb='00000000')),
    right=Side(border_style='thin', color=Color(rgb='00000000')),
    top=Side(border_style='thin', color=Color(rgb='00000000')),
    bottom=Side(border_style='thin', color=Color(rgb='00000000')))

（2）读取PDF文档中的所有表格数据，返回一个列表。

def open_pdf_1(url,biao_tou):
    tab_lis,biao_i=[],[]
    with pdfplumber.open(url) as pdf:
        page=pdf.pages
        for pa in page:
            pag=pa.extract_tables()
            for p in pag:
                for sp in p:
                    tab_lis.append(sp)             
    for i , t in enumerate(tab_lis):
        if t[0] is not None and t[0] in biao_tou:
            biao_i.append(i)
    biao_i.append(len(tab_lis))
    biao_0=[tab_lis[biao_i[j]:biao_i[j+1]] for j in range(len(biao_i)-1)]
    return biao_0

（3）将上面读取到的表格数据转换为Excel 表格，保存在一个Excel 工作簿中，PDF 文档中有多个表格的，按顺序分别保存为工作表“表1、表2、表3 .......'。

def to_excels(biao,to_url):
    df_shee=[]
    for sh in biao: 
        df_shee.append(pd.DataFrame(sh))
    with pd.ExcelWriter(to_url, mode='w') as writer:
        for j,s in enumerate(df_shee):
            s.to_excel(writer, sheet_name=f'表_{j+1}', index=False,header=False)

（4）设置好PDF文档及Excel文档保存位置，运行函数 to_excels(biao,to_url)，生成初始Excel表格。

if __name__ == '__main__':
    url=r'0906\biao2.pdf'   # 根据 biao2.pdf 实际保存位置修改为正确的路径
    to_url=r'0906\biao2.xlsx'
    biao_tou=['表格标题']
    biao=open_pdf_1(url,biao_tou)
    to_excels(biao,to_url)

（5 ）生成的Excel表格，格式混乱，没有调整行、列的宽高，还有一些单元格需要合并，接下来会写一段代码来完善这个问题。现将补全后的完整代码发布如下：

import pdfplumber,openpyxl
import pandas as pd
from openpyxl.styles import Alignment,Color,Border,Side # 导入字体和颜色模块,填充模块
from openpyxl.utils import get_column_letter

 # 居中
alig_center = Alignment(horizontal='center', vertical='center')
border = Border(
    left=Side(border_style='thin', color=Color(rgb='00000000')),
    right=Side(border_style='thin', color=Color(rgb='00000000')),
    top=Side(border_style='thin', color=Color(rgb='00000000')),
    bottom=Side(border_style='thin', color=Color(rgb='00000000')))

def to_excels(biao):
    df_shee=[]
    for sh in biao: 
        df_shee.append(pd.DataFrame(sh))
    with pd.ExcelWriter(to_url, mode='w') as writer:
        for j,s in enumerate(df_shee):
            s.to_excel(writer, sheet_name=f'表_{j+1}', index=False,header=False)

def open_pdf_1(url,biao_tou):
    tab_lis,biao_i=[],[]
    with pdfplumber.open(url) as pdf:
        page=pdf.pages
        for pa in page:
            pag=pa.extract_tables()
            for p in pag:
                for sp in p:
                    tab_lis.append(sp)             
    for i , t in enumerate(tab_lis):
        if t[0] is not None and t[0] in biao_tou:
            biao_i.append(i)
    biao_i.append(len(tab_lis))
    biao_0=[tab_lis[biao_i[j]:biao_i[j+1]] for j in range(len(biao_i)-1)]
    return biao_0

def he_bing(k_all):
    wb = openpyxl.load_workbook(to_url)
    sheets=wb.sheetnames
    for she,sk_hebin in zip(sheets,k_all):
        sheet = wb[she]
        nrows = sheet.max_row
        ncols = sheet.max_column
        for s in sk_hebin:
            sheet.merge_cells(s)  # 合并单元格
        for i in range(1, nrows+1):     # 设置居中对齐 前3行自动换行
            sheet.row_dimensions[i].height = 22
            sheet.column_dimensions[get_column_letter(i)].width = 14 # 动态设置列宽
            for j in range(1, ncols+1):
                sheet.cell(i,j).alignment = alig_center # 居中
                sheet.cell(i,j).border = border  # 边框
    wb.save(to_url)

def heng_hebin(lisg):
    lis_3 = []
    for k,ks in enumerate(lisg):
        lis_2=[]
        for j,ss in enumerate(ks):
            if ss is not None: 
                lis_2.append(j)
        lis_2.append(len(lisg[0]))
        for i in range(len(lis_2)-1):
            if lis_2[i+1]-lis_2[i]>1:
                if k in [0,1] or (k in [2] and lis_2[i]>3) or (k in [7,8] and lis_2[i]<6):
                    lis_3.append([lis_2[i],k+1,lis_2[i+1]-1,k+1])
    return lis_3

def shu_hebin(lisg):
    lis_3=[]
    for k in range(len(lisg[0])):
        lis_1=[s[k] for s in lisg]
        lis_2=[]
        for j,ss in enumerate(lis_1):
            if ss is not None:
                lis_2.append(j)
        lis_2.append(len(lis_1))
        for i in range(len(lis_2)-1):
            if lis_2[i+1]-lis_2[i]>1:
                if (k in [1] and lis_2[i] in [1,2,3,4,5]) or (k in [7,9] and lis_2[i] in [7,9]):
                    lis_3.append([k,lis_2[i]+1,k,lis_2[i+1]])
    return lis_3

def run_hen_shu(lisg):
    list_0=[]
    for ss in lisg:
        shu=shu_hebin(ss)
        hen=heng_hebin(ss)
        henshu=[[5,4,8,7]]
        list_0.append(shu+hen+henshu)
    return list_0

def run_1(url,biao):
    lis=open_pdf_1(url,biao)
    he_bin_geshi=run_hen_shu(lis)
    to_excels(lis)
    sk_k_all=[]
    for he_geshi in he_bin_geshi:
        sk_k=[abc[s[0]]+str(s[1])+':'+abc[s[2]]+str(s[3]) for s in he_geshi]
        sk_k_all.append(sk_k)
    he_bing(sk_k_all) 

if __name__ == '__main__':
    abc=['A','B','C','D','E','F','G','H','I','J']
    url_0=r'D:\pdf_xlsx\pdf_f\biao2.pdf'
    to_url=url_0.replace('.pdf','_1.xlsx')
    to_url_1=url_0.replace('.pdf','_1_1.xlsx')
    biao_tou=['表格标题']
    run_1(url_0,biao_tou)