pip install pypdf2
pip install pdfplumber==0.5.14
利用pdfplumber提取文字
import pdfplumber
# Open the PDF inside a context manager so the file handle is released
# when the block exits.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    # `pages` is zero-indexed, so index 0 is the first page.
    first_page = pdf.pages[0]
    # Print the text extracted from that page.
    print(first_page.extract_text())
利用pdfplumber提取表格
import pdfplumber
# Open the sample PDF inside a context manager so the file handle is
# released when the block exits.
with pdfplumber.open("simple_1.pdf") as pdf:
    # `pages` is zero-indexed, so index 0 is the first page.
    first_page = pdf.pages[0]
    # extract_table() returns a single table from the page; print it as-is.
    print(first_page.extract_table())
利用pdfplumber提取多个简单的表格
import pdfplumber
# Open the sample PDF inside a context manager so the file handle is
# released when the block exits.
with pdfplumber.open("simple_1.pdf") as pdf:
    # `pages` is zero-indexed, so index 0 is the first page.
    table_page = pdf.pages[0]
    # extract_tables() yields every table found on the page; print each one.
    for table in table_page.extract_tables():
        print(table)
提取复杂表格时,需要设置 .extract_table() 方法的 table_settings 参数
import pdfplumber
# Open the earnings-release PDF inside a context manager so the file handle
# is released when the block exits.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    # `pages` is zero-indexed, so index 9 is the tenth page.
    table_page = pdf.pages[9]
    # Use the "text" strategy on both axes instead of the default settings,
    # so the table layout is derived from the page's text.
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )
    print(table)
将获取的数据写到Excel中
import pdfplumber
# Open the earnings-release PDF inside a context manager so the file handle
# is released when the block exits.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    # `pages` is zero-indexed, so index 9 is the tenth page.
    table_page = pdf.pages[9]
    # Use the "text" strategy on both axes instead of the default settings,
    # so the table layout is derived from the page's text.
    # `table` stays bound after the `with` block for the code that follows.
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )
from openpy