目录
1、安装相关库文件2、事先准备一个包含类似以下格式表格的PDF文档,并保存在相应目录中,测试用的PDF文档下载:测试用的PDF文档 该示例文档只包含了两个格式相同的表格。
前言
有一个朋友是做跨境电子商务的,开了几个商务网店,每天早上上班的时候,都要求员工从各个商店下载大量商品记录,并进行汇总分析,但下载的都是包含表格的PDF格式,只能手工摘录PDF表格数据保存到一个Excel,这样做费时费力又容易出错,所以精心了个小程序,可以批量提取PDF文档的表格汇总成Excel表格,便于统计、分析。
一、准备工作
1、安装相关库文件
# 需要以下三个库
pip install pdfplumber
pip install openpyxl
pip install pandas
2、事先准备一个包含类似以下格式表格的PDF文档,并保存在相应目录中,测试用的PDF文档下载:测试用的PDF文档 该示例文档只包含了两个格式相同的表格。
二、设计步骤
1.引入库
import pdfplumber,openpyxl,os
import pandas as pd
import numpy as np
from openpyxl.styles import Alignment,Color,Border,Side # 导入字体和颜色模块,填充模块
from openpyxl.utils import get_column_letter
2.设计思路
(1)预设单元格居中及边框线条粗细,备用。
alig_center = Alignment(horizontal='center', vertical='center') # 居中
border = Border( # 设置表格的边框
left=Side(border_style='thin', color=Color(rgb='00000000')),
right=Side(border_style='thin', color=Color(rgb='00000000')),
top=Side(border_style='thin', color=Color(rgb='00000000')),
bottom=Side(border_style='thin', color=Color(rgb='00000000')))
(2) 读取PDF文档中的所有表格数据,返回一个列表。
def open_pdf_1(url,biao_tou):
tab_lis,biao_i=[],[]
with pdfplumber.open(url) as pdf:
page=pdf.pages
for pa in page:
pag=pa.extract_tables()
for p in pag:
for sp in p:
tab_lis.append(sp)
for i , t in enumerate(tab_lis):
if t[0] is not None and t[0] in biao_tou:
biao_i.append(i)
biao_i.append(len(tab_lis))
biao_0=[tab_lis[biao_i[j]:biao_i[j+1]] for j in range(len(biao_i)-1)]
return biao_0
(3)将上面读取到的表格数据转换为Excel 表格,保存在一个Excel 工作簿中,PDF 文档中有多个表格的,按顺序分别保存为工作表“表1、表2、表3 .......'。
def to_excels(biao,to_url):
df_shee=[]
for sh in biao:
df_shee.append(pd.DataFrame(sh))
with pd.ExcelWriter(to_url, mode='w') as writer:
for j,s in enumerate(df_shee):
s.to_excel(writer, sheet_name=f'表_{j+1}', index=False,header=False)
(4)设置好PDF文档及Excel文档保存位置,运行 函数 to_excels(biao,to_url),生成初始Excel表格。
if __name__ == '__main__':
url=r'0906\biao2.pdf' # 根据 biao2.pdf 实际保存位置修改为正确的路径
to_url=r'0906\biao2.xlsx'
biao_tou=['表格标题']
biao=open_pdf_1(url,biao_tou)
to_excels(biao,to_url)
(5 )生成的Excel表格,格式混乱,没有调整行、列的宽高,还有一些单元格需要合并,接下来会写一段代码来完善这个问题。现将补全后的完整代码发布如下:
import pdfplumber,openpyxl
import pandas as pd
from openpyxl.styles import Alignment,Color,Border,Side # 导入字体和颜色模块,填充模块
from openpyxl.utils import get_column_letter
# 居中
alig_center = Alignment(horizontal='center', vertical='center')
border = Border(
left=Side(border_style='thin', color=Color(rgb='00000000')),
right=Side(border_style='thin', color=Color(rgb='00000000')),
top=Side(border_style='thin', color=Color(rgb='00000000')),
bottom=Side(border_style='thin', color=Color(rgb='00000000')))
def to_excels(biao):
df_shee=[]
for sh in biao:
df_shee.append(pd.DataFrame(sh))
with pd.ExcelWriter(to_url, mode='w') as writer:
for j,s in enumerate(df_shee):
s.to_excel(writer, sheet_name=f'表_{j+1}', index=False,header=False)
def open_pdf_1(url,biao_tou):
tab_lis,biao_i=[],[]
with pdfplumber.open(url) as pdf:
page=pdf.pages
for pa in page:
pag=pa.extract_tables()
for p in pag:
for sp in p:
tab_lis.append(sp)
for i , t in enumerate(tab_lis):
if t[0] is not None and t[0] in biao_tou:
biao_i.append(i)
biao_i.append(len(tab_lis))
biao_0=[tab_lis[biao_i[j]:biao_i[j+1]] for j in range(len(biao_i)-1)]
return biao_0
def he_bing(k_all):
wb = openpyxl.load_workbook(to_url)
sheets=wb.sheetnames
for she,sk_hebin in zip(sheets,k_all):
sheet = wb[she]
nrows = sheet.max_row
ncols = sheet.max_column
for s in sk_hebin:
sheet.merge_cells(s) # 合并单元格
for i in range(1, nrows+1): # 设置居中对齐 前3行自动换行
sheet.row_dimensions[i].height = 22
sheet.column_dimensions[get_column_letter(i)].width = 14 # 动态设置列宽
for j in range(1, ncols+1):
sheet.cell(i,j).alignment = alig_center # 居中
sheet.cell(i,j).border = border # 边框
wb.save(to_url)
def heng_hebin(lisg):
lis_3 = []
for k,ks in enumerate(lisg):
lis_2=[]
for j,ss in enumerate(ks):
if ss is not None:
lis_2.append(j)
lis_2.append(len(lisg[0]))
for i in range(len(lis_2)-1):
if lis_2[i+1]-lis_2[i]>1:
if k in [0,1] or (k in [2] and lis_2[i]>3) or (k in [7,8] and lis_2[i]<6):
lis_3.append([lis_2[i],k+1,lis_2[i+1]-1,k+1])
return lis_3
def shu_hebin(lisg):
lis_3=[]
for k in range(len(lisg[0])):
lis_1=[s[k] for s in lisg]
lis_2=[]
for j,ss in enumerate(lis_1):
if ss is not None:
lis_2.append(j)
lis_2.append(len(lis_1))
for i in range(len(lis_2)-1):
if lis_2[i+1]-lis_2[i]>1:
if (k in [1] and lis_2[i] in [1,2,3,4,5]) or (k in [7,9] and lis_2[i] in [7,9]):
lis_3.append([k,lis_2[i]+1,k,lis_2[i+1]])
return lis_3
def run_hen_shu(lisg):
list_0=[]
for ss in lisg:
shu=shu_hebin(ss)
hen=heng_hebin(ss)
henshu=[[5,4,8,7]]
list_0.append(shu+hen+henshu)
return list_0
def run_1(url,biao):
lis=open_pdf_1(url,biao)
he_bin_geshi=run_hen_shu(lis)
to_excels(lis)
sk_k_all=[]
for he_geshi in he_bin_geshi:
sk_k=[abc[s[0]]+str(s[1])+':'+abc[s[2]]+str(s[3]) for s in he_geshi]
sk_k_all.append(sk_k)
he_bing(sk_k_all)
if __name__ == '__main__':
abc=['A','B','C','D','E','F','G','H','I','J']
url_0=r'D:\pdf_xlsx\pdf_f\biao2.pdf'
to_url=url_0.replace('.pdf','_1.xlsx')
to_url_1=url_0.replace('.pdf','_1_1.xlsx')
biao_tou=['表格标题']
run_1(url_0,biao_tou)
总结
本文仅对一个简单PDF文档进行表格提取,主要使用了 pdfplumber 库来操作PDF文档,提取表格信息,该库的功能十分强大,操作方便,值得推荐。本文对提取的两个表格,根据表格标志行,进行分割,独立保存到Excel 表上,需对Excel表格进行格式化。
更新:格式化的代码已经写好,进行整理,完整代码已发表。
本文是我这个初学的业余爱好者所写,不足之处,敬请各们大侠指正。