背景说明
本文选取的是当前日期上交所科创板所有的上市公司样本。
用八爪鱼从上交所公告页面爬取公告下载链接,使用迅雷批量下载。
阅读代码时注意点
流程思路:
1、提取pdf内容后,通过文本匹配锁定报表页码区间。
2、提取该区间所有表格保存在sheet表中。
3、继续依靠表尾文本匹配,分割sheet表保存到三个表页中。
4、将货币数字文本调整为可转换数字文本并统一单位量纲
注意点:
1、由于同一张报表跨页所以含“项目”的表头出现了不止一次,可以通过条件筛选直接跳过这一行的输出。
2、PyPDF2对中文支持不好,所以选用了pdfplumber模块进行处理。
3、发现利润表最后一项不一致,选择靠近后排的“综合收益总额”,如果这一行后面也要的话可以引入sign标记辅助判断
4、单位有万元和元两种type
5、运行时间似乎太长,可以探索多线程执行
源代码
import os
import pdfplumber
import re
from openpyxl import Workbook
input=r'C:\Users\huang\Desktop\招股意向书'
def pdf2tab(file,input=r'C:\Users\huang\Desktop\招股意向书',output=r'C:\Users\huang\Desktop\输出',tle=['合并资产负债表','合并利润表','合并现金流量表']):
os.chdir(input)
with pdfplumber.open(file) as pdf:
i = 0
flag = 0
for page in pdf.pages:
text = page.extract_text()
ResSearch = re.search("合并资产负债表 \n单位:元", text)
if ResSearch != None:
flag = 1
p1 = i
typ=1
print("type=1")
else:
res = re.search("合并资产负债表 \n单位:万元", text)
if res != None:
flag = 1
p1 = i
typ=2
print("type=2")
else:
res = re.search("资产负债表 \n单位:元", text)
if res != None:
flag = 1
p1 = i
typ=1
print("type=1")
else:
res = re.search("资产负债表 \n单位:万元", text)
if res != None:
flag = 1
p1 = i
typ=2
print("type=2")
if flag == 1:
Res = re.search("编制基础", text)
if Res != None:
p2 = i
i = i + 1
print(p1, p2)
workbook = Workbook()
sheet = workbook.active
for p in range(p1, p2 + 1, 1):
page = pdf.pages[p]
table = page.extract_table()
for row in table:
sheet.append(row)
flag = 1
j = 1
sign = 0
for row in sheet.rows:
i = 65
if flag <= 3:
if row[0].value.find('项目') != -1 and (j == 1 or sign == 1):
# print("very good")
if sign == 1:
j = 1
sign = 0
workbook.create_sheet(index=flag, title=tle[flag - 1])
# mysheet = workbook.get_sheet_by_name(tle[flag-1])方法过时有bug?
mysheet = workbook[tle[flag - 1]]
for cell in row:
c = cell.value
tmp = chr(i) + str(j)
mysheet[tmp] = c
i = i + 1
j = j + 1
if row[0].value.find('项目') == -1 and j > 1:
# print("find")
if typ == 1:
for cell in row:
c = cell.value
if i > 65:
if type(c) is type('a'):
ResSearch = re.search(",", c)
if ResSearch != None:
c = c.replace(",", '')
tmp = chr(i) + str(j)
mysheet[tmp] = c
i = i + 1
j = j + 1
else:
for cell in row:
c = cell.value
if i > 65:
if type(c) is type('a'):
ResSearch = re.search(",", c)
if ResSearch != None:
c = c.replace(",", '')
ResSearch = re.search(".", c)
if ResSearch != None:
c = c.replace(".", '') + '00'
tmp = chr(i) + str(j)
mysheet[tmp] = c
i = i + 1
j = j + 1
else:
break
if flag == 3 and row[0].value.find('期末现金及现金等价物余额') != -1:
flag = 4
j = 1
else:
if flag == 2 and row[0].value.find('综合收益总额') != -1:
flag = 3
sign = 1
else:
if flag == 1 and row[0].value.find('负债和') != -1:
flag = 2
j = 1
workbook.save(output + "\\" + file + ".xlsx")
os.chdir(input)
for file in os.listdir():
if os.path.splitext(file)[1]=='.pdf':
pdf2tab(file)