基本操作
#pdfplumber对表格的处理能力强于pdfminer
import pdfplumber
import re
# Basic usage: print the extracted text and every table of each page of a PDF.
# NOTE(review): the original paste had lost its indentation, so the loop
# nesting below is reconstructed -- confirm it matches the intended structure.
path = r'E:\py_shiyan\Pycharm备份\Financial-data-collection-from-web--master\original_data/test3.pdf'

# Open via a context manager so the underlying file handle is closed even if
# extraction raises part-way through.
with pdfplumber.open(path) as pdf:
    # The page count does not change per page, so report it once up front.
    print(len(pdf.pages))

    for page in pdf.pages:
        # Plain text of the current page.
        print(page.extract_text())

        # Each table found on the page is printed whole, then row by row.
        for pdf_table in page.extract_tables():
            print(pdf_table)
            for row in pdf_table:
                print(row)
进阶操作
对解析出的 PDF 表格进行整理加工:逐行读取,去掉完全空白的行,并将不完全空白的行合并。
import pdfplumber
import re
path = r'E:\py_shiyan\Pycharm备份\Financial-data-collection-from-web--master\original_data/test3.pdf'
pdf = pdfplumber.open(path)
for page in pdf.pages:
table = []
cells = []
print(page.extract_text())
for pdf_table in page.extract_tables():
for row in pdf_table:
if not any(row):
# 如果一行全为空,则视为一条记