要找财报里的某个数据,根据网上代码自己缝缝补补写了个爬虫,记录一下(复制粘贴万岁)。好处是能完成任务,缺点是巨慢无比,代码主要参考以下知乎文章:如何用PYTHON爬到巨潮资讯里几个上市公司的年度报告?
1. 爬取url——函数定义
(get_org_id是因为巨潮的stock查找好像不仅要stkcd,还需要orgid)
import requests
import re
import openpyxl
import time
#定义函数获取org_id
def get_org_id(code):
    """Look up the cninfo orgId for a stock code via the topSearch endpoint.

    cninfo's announcement search wants "stkcd,orgid", so the orgId must be
    resolved first.  Returns the orgId of the first hit, or None when the
    search produced no results.  Relies on the module-level `headers`.
    """
    query_url = "http://www.cninfo.com.cn/new/information/topSearch/query"
    payload = {
        "keyWord": code,
        "maxNum": "10",
    }
    hits = requests.post(url=query_url, data=payload, headers=headers).json()
    # The response is a list of candidate securities; take the first match.
    for hit in hits:
        return hit["orgId"]
    return None
#定义函数获取url
def get_url(stkcd, orgid, date):
    """Find the annual-report announcement for one stock in one year.

    Posts to the module-level `url` (hisAnnouncement/query) with the
    module-level `headers` and returns (title, secCode, download_url).
    Returns ("None", "None", "None") when nothing usable is found.
    """
    payload = {
        "stock": "{},{}".format(stkcd, orgid),
        "tabName": "fulltext",
        "pageSize": 30,
        "PageNum": 1,
        "category": "category_ndbg_szsh",  # ndbg: annual report
        "seDate": str(date) + "-01-01~" + str(date) + "-12-31",  # query window
        "isHLtitle": "true"
    }
    response = requests.post(url, data=payload, headers=headers)
    response.raise_for_status()
    # The JSON payload carries the matches under "announcements".
    announcements = response.json()["announcements"]
    if announcements is None:  # guard against an empty result set
        return "None", "None", "None"
    for item in announcements:
        # Skip abstracts, cancelled notices and indicator-only extracts.
        if re.search(r'摘要|(已取消)|财务指标', item['announcementTitle']):
            continue
        title = item['announcementTitle']
        # '*' is illegal in file names, so strip it from the short name
        # before it could be used for saving.  (Note: the original returns
        # secCode, not this cleaned name.)
        sec_name = item['secName'].replace('*', '')
        sec_code = item['secCode']
        # PDF address = http://static.cninfo.com.cn/ + adjunctUrl
        down_url = 'http://static.cninfo.com.cn/' + item['adjunctUrl']
        return title, sec_code, down_url
    # Every candidate was filtered out.
    return "None", "None", "None"
1. 爬取url——执行程序:根据code.xlsx中的代码和年份爬取年报
import xlrd
import openpyxl
from openpyxl import load_workbook
# Endpoint of cninfo's historical-announcement search API.
url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
# Browser-like headers for all requests.  The original hard-coded
# "Content-Length": "178", which is wrong for any payload of another size
# and can make the server reject or mis-read the POST — requests computes
# the correct Content-Length itself, so that header is omitted here.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Host": "www.cninfo.com.cn",
    "Origin": "http://www.cninfo.com.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
#根据证券代码获取对应年份年报
if __name__ == '__main__':
    # code.xlsx: first column = stock codes, second column = fiscal years.
    workbook_in = load_workbook('code.xlsx')
    sheet3 = workbook_in.worksheets[3]  # NOTE(review): index 3 is the 4th sheet — confirm
    stkcds = [cell.value for cell in list(sheet3.columns)[0]]
    years = [cell.value for cell in list(sheet3.columns)[1]]
    lst = []
    # Start at 1 to skip the header row.
    # Bug fix: the original iterated over `codes`, a name that is never
    # defined (the list is called `stkcds`), which raises NameError.
    for i in range(1, len(stkcds)):
        stkcd = stkcds[i]
        # The annual report for fiscal year Y is published during Y+1,
        # so search the following calendar year.
        year = years[i] + 1
        orgid = get_org_id(stkcd)
        File_name, Code, Url = get_url(stkcd, orgid, year)
        lst.append([File_name, Code, Url])
        print(File_name, "已完成")
        time.sleep(3)  # throttle requests to be polite to the server
    # Write the collected (title, code, url) rows to excel.
    wb = openpyxl.Workbook()
    sht = wb.active
    for row in lst:
        sht.append(row)
    wb.save("urls.xlsx")
2. 查找指标——定义函数
#-*- coding:utf-8 -*-
import csv
import re
import io
import os
import time
import pdfplumber
from PyPDF2 import PdfReader
import pandas as pd
import requests
def download_pdf(save_path, pdf_name, pdf_url):
    """Download pdf_url and save it as <save_path><pdf_name>.PDF.

    Raises requests.HTTPError on a non-2xx response instead of silently
    writing an error page to disk, and requests.Timeout if the server
    stalls for more than 60 seconds.
    """
    # The original set requests.DEFAULT_RETRIES and r.keep_alive — neither
    # is a real requests API, so both were no-ops and are dropped.  The
    # BytesIO round-trip was also unnecessary: r.content is already bytes.
    r = requests.get(pdf_url, timeout=60)
    r.raise_for_status()
    with open(save_path + "%s.PDF" % pdf_name, mode='wb') as f:
        f.write(r.content)
    print('%s.PDF,下载成功!' % (pdf_name))
# 定义查找函数定位文本所在页数
def PDF_FindText(xFile, xString):
    """Return the 0-based index of the first page of PDF `xFile` whose
    extracted text matches the regex `xString`, or -1 when no page matches.
    """
    # `with` guarantees the file handle is closed even if PyPDF2 raises
    # mid-scan (the original leaked the handle on any exception).
    with open(xFile, 'rb') as pdf_file:
        pdf_doc = PdfReader(pdf_file, strict=False)
        for page_index, page in enumerate(pdf_doc.pages):
            # Bug fix: extract_text() can return None (e.g. image-only
            # pages), which made the original crash on `None + "\n"`.
            content = (page.extract_text() or "") + "\n"
            if re.search(xString, content) is not None:
                return page_index
    return -1
#提取某页表格
def extract_tables(filepath):
    """Extract the tables of the target annual-report section.

    Locates the section by its start/end keywords, then returns a list with
    one extract_tables() result per page in that span.  Returns the string
    "None" when the start keyword is absent, and a warning string when the
    match looks implausible (span > 30 pages or missing end marker).
    """
    with pdfplumber.open(filepath) as pdf:
        p_start = PDF_FindText(filepath, "被购买方于购买日可辨认资产、负债")  # section start keyword
        p_end = PDF_FindText(filepath, "购买日之前持有的股权按照公允价值重新计量产生的利得或损失")
        if p_start == -1:
            return "None"
        # A span over 30 pages, or a missing end marker, means the keywords
        # hit the wrong place — ask the user to check by hand.
        if p_end - p_start > 30 or p_end == -1:
            return filepath + "查询结果异常,请手动查询"
        return [pdf.pages[page_no].extract_tables()
                for page_no in range(p_start, p_end + 1)]
#text_list = []
#flag = 0
2. 查找指标——运行函数:下载PDF,并将提取出的信息写入新文件
text_list = []
# NOTE(review): r"C:.\urls.csv" resolves to the *current directory* of drive
# C:, not the drive root — confirm this path is intended.
# Opened "r" instead of the original "r+": the file is only ever read here.
with open(r"C:.\urls.csv", "r", newline='') as f:
    reader = csv.reader(f)
    head = next(reader)  # skip the header row
    for i, row in enumerate(reader):
        if row[1] == "None":
            # get_url() found no annual report for this firm/year.
            text_list.append("年报文件缺失")
        else:
            # NOTE(review): the producer script writes rows as
            # [title, secCode, url], which would make the URL row[2], not
            # row[1] — verify the column order of urls.csv before running.
            saving_path = "C:/Users/xxx/Desktop/年报/"
            file_name = row[0]
            file_path = saving_path + file_name + ".PDF"
            download_pdf(saving_path, file_name, row[1])
            time.sleep(10)  # pause between download and parsing
            text = extract_tables(file_path)
            text_list.append(text)
            time.sleep(5)
            os.remove(file_path)  # the PDF is no longer needed once parsed
            print(i, file_path, "执行成功:)")
with open(r"C:.\结果.csv", "w", newline='', encoding="UTF-8") as f:
    writer = csv.writer(f)
    # Bug fix: the original wrote a fixed range(500) regardless of how many
    # results exist — an IndexError when len(text_list) < 500, silent
    # truncation when it is larger.  Write exactly what was collected.
    # The +3000 row-id offset is preserved; it appears to index rows
    # 3001.. of a larger sample — confirm against the master file.
    for i, text in enumerate(text_list):
        writer.writerow([i + 1 + 3000, text])
3. 结果展示
根据关键词查到指标的text_list如图,我这里查询的是“财务报表附注—合并范围变更—非同一控制下合并—被购买方于购买日可辨认资产、负债”的结果。
稍微清理一下,整理成dataframe,再根据表头提取需要的部分即可。但是有部分表格的处理结果还是不太理想,经常有断行的情况╥﹏╥,不知道有没有更好用的pdf表格提取工具。