P12.Python提取PDF文字内容.md
#pdfplumber提取文字
pdfplumber.open(PDF路径)
pdf.pages[页数]
page.extract_text()
例:
import pdfplumber
# Open the PDF and print the extracted text of every page, in page order.
with pdfplumber.open('test.pdf') as pdf:
    # The page list is an attribute of the opened PDF object.
    for page in pdf.pages:
        print(page.extract_text())
#pdfplumber提取表格
page.extract_table()
例:
# Extract a single table from the first page (index 0) and print it.
with pdfplumber.open('test.pdf') as pdf:
    table_page = pdf.pages[0]
    # extract_table() is a method of the page object, not of the result.
    table = table_page.extract_table()
    print(table)
#提取多个简单表格
page.extract_tables()
例:
# Extract every table found on the second page (index 1) and print them.
with pdfplumber.open('test.pdf') as pdf:
    table_page = pdf.pages[1]
    # extract_tables() returns a list with one entry per detected table.
    tables = table_page.extract_tables()
    print(tables)
#利用pdfplumber提取网易财报表格
table_settings:提取表格时的设定
例:
# Extract the table on the page at index 9 of the NetEase earnings release.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    table_page = pdf.pages[9]
    # The "text" strategies infer cell boundaries from how the words are
    # aligned instead of relying on ruling lines drawn in the PDF.
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        })
    print(table)
#写入到Excel表格中
例:
# Workbook comes from openpyxl; import it here so the example runs on its own.
from openpyxl import Workbook
# Extract the table on the page at index 9 of the NetEase earnings release.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    table_page = pdf.pages[9]
    # extract_table() returns None when no table is found; fall back to an
    # empty list so the loop below simply writes nothing.
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }) or []
# The extracted table is a plain list of rows, so the PDF can already be
# closed while the workbook is written.
workbook = Workbook()
sheet = workbook.active
for row in table:
    # Each extracted row becomes one worksheet row.
    sheet.append(row)
workbook.save(filename='Netease_Q2_2019_Earnings.xlsx')
#注意:存在空行和将单词切分到多个不同列的问题
#去除空行
简单判断,非空行的才加进来
将列表中每个元素都连接成一个字符串,如果还是空字符串那么肯定就是空行
new_table = []
for row in table:
if not ''.join([str(item) for item in row]) == '':
#''.join(列表)用法:拼接字符串(列表元素必须是字符串),如print('~'.join(['1','2','3']))输出1~2~3
#[str(item) for item in row]列表生成式,简单快速地生成一个python列表
#合并单词
分析结果可知前三列分布着第一列的单词
将前三列非None的内容合并为一个字符串,然后再合到一个列表里
# The first three cells hold pieces of one label: join the non-empty pieces
# into a single string (empty or None cells contribute nothing).
merged_label = ''.join([str(cell) if cell else '' for cell in row[:3]])
# Rebuild the row as the merged label followed by the remaining cells.
new_row = [merged_label]
new_row.extend(row[3:])
new_table.append(new_row)
#行内条件判断 y = x*2 if x<10 else 20
#提取网易财报表格完整代码
import pdfplumber
from openpyxl import Workbook

# Step 1: extract the table on the page at index 9 of the earnings release.
with pdfplumber.open('Netease Q2 2019 Earnings Release-Final.pdf') as pdf:
    table_page = pdf.pages[9]
    # The "text" strategies infer cell boundaries from how the words are
    # aligned instead of relying on ruling lines drawn in the PDF.
    # extract_table() returns None when no table is found; fall back to an
    # empty list so the clean-up loop below is a no-op in that case.
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }) or []

# Step 2: clean the raw rows.
new_table = []
for row in table:
    # Drop rows in which every cell is empty ('' or None).
    if not any(row):
        continue
    # The label of the first column is spread over the first three cells;
    # merge the non-empty pieces back into a single string.
    label = ''.join(str(item) if item else '' for item in row[:3])
    new_table.append([label, *row[3:]])

# Step 3: write the cleaned rows to a new workbook, one row per line.
workbook = Workbook()
sheet = workbook.active
for row in new_table:
    sheet.append(row)
workbook.save(filename='Netease_Q2_2019_Earnings.xlsx')