1.在b站和CSDN找到很多批量爬取年报的方法。我找到的大部分代码不区分公告类型,但是我只需要年报。
2.我想要的是三板挂牌公司的年度报告,我找到的方法下载的都是主板上市公司的公告。怎样将参数中的证券类型替换为三板也难倒了我。自学摸索,终于在开发人员工具中找到巨潮的三板的参数
"column": "third",
"category": "category_dqgg"
3.我不需要年度报告摘要,或英文、或半年度报告等不符合我下载需要的文件。在B站找到一个视频链接,设置了下载条件。奇怪的是按照up主的代码是可以下载主板上市公司公告的,换成三板的参数,下载的pdf都只有1kb,打不开的。
python爬取巨潮资讯上市公司年报_哔哩哔哩_bilibili
4.找到这个链接,很简洁的代码,关键是真的管用。我把b站up主设定的下载条件添加到这个链接提供的代码里。三板的年度报告终于可以批量下载了。
【工作提效】教你如何用Python轻松爬取上市公司年报,新手也能快速上手!copy即用_教你如何用python轻松爬取上市公司年报,新手也能快速上手-CSDN博客
以下是批量下载2023-11-01~2024-06-30期间披露的新三板挂牌公司年度报告的源代码。
import requests
import pandas as pd
import os
# Request headers sent with every cninfo API call. "x-requested-with:
# XMLHttpRequest" mimics the browser's AJAX call so the endpoint replies
# with JSON instead of an HTML page.
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "proxy-connection": "keep-alive",
    "x-requested-with": "XMLHttpRequest"
}
# Output locations (Windows-style absolute path; adjust for your machine).
base_path = 'D:\\PycharmProjects\\pythonProject1\\Report_2023'
announcements_dir = os.path.join(base_path, 'announcements')
# Create the folder that will hold the downloaded PDFs.
# exist_ok=True keeps reruns from failing when the folder already exists.
os.makedirs(announcements_dir, exist_ok=True)
def _is_annual_report(title):
    """Return True when *title* looks like a full annual report.

    Keeps titles containing '年度报告' while rejecting summaries ('摘要'),
    English versions ('英文'), semi-annual reports ('半年度'), cancelled
    filings ('已取消') and plain notices ('公告').
    """
    if '年度报告' not in title:
        return False
    excluded = ('公告', '摘要', '已取消', '英文', '半年度')
    return not any(word in title for word in excluded)


def fetch_announcements(start_date, end_date, page_num=1, page_size=30, timeout=30):
    """Query cninfo for NEEQ (三板) periodic filings in [start_date, end_date].

    Parameters:
        start_date, end_date: 'YYYY-MM-DD' strings, joined into the API's
            seDate range parameter.
        page_num, page_size: pagination controls for the POST body.
        timeout: seconds before the HTTP request is abandoned (new optional
            parameter; the default keeps existing call sites working).

    Returns:
        (rows, has_more): rows is a list of
        [sec_code, sec_name, title, download_url] entries passing the
        annual-report filter; has_more is the API's paging flag.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    body = {
        "pageNum": page_num,
        "pageSize": page_size,
        "column": "third",            # 'third' + plate 'neeq' selects NEEQ (三板) listings
        "tabName": "fulltext",
        "plate": "neeq",
        "stock": "",
        "searchkey": "",
        "secid": "",
        "category": "category_dqgg",  # periodic disclosures (定期公告)
        "trade": "农、林、牧、渔业;电力、热力、燃气及水生产和供应业;交通运输、仓储和邮政业;科学研究和技术服务业;教育;综合;卫生和社会工作;"
                 "水利、环境和公共设施管理业;住宿和餐饮业;建筑业;采矿业;文化、体育和娱乐业;居民服务、修理和其他服务业;租赁和商务服务业;房"
                 "地产业;信息传输、软件和信息技术服务业;批发和零售业;制造业",
        "seDate": f"{start_date}~{end_date}",
        "sortName": "",
        "sortType": "",
        "isHLtitle": "true"
    }
    # Original code had no timeout, so a stalled server hung the script forever.
    response = requests.post(url, headers=headers, data=body, timeout=timeout)
    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return [], False
    data = response.json()
    # The API returns "announcements": null past the last page; the extra
    # truthiness check prevents a TypeError when iterating it.
    if 'announcements' not in data or not data['announcements']:
        print("No announcements found that match the criteria.")
        return [], False
    announcements_list = []
    for announcement in data['announcements']:
        sec_code = announcement.get("secCode", "")
        sec_name = announcement.get("secName", "")
        title = announcement.get("announcementTitle", "")
        download_url = f"http://static.cninfo.com.cn/{announcement.get('adjunctUrl', '')}"
        if _is_annual_report(title):
            announcements_list.append([sec_code, sec_name, title, download_url])
    has_more = data.get('hasMore', False)  # .get() guards against a missing key
    print(f"Fetched {len(announcements_list)} announcements. hasMore is {has_more}")
    return announcements_list, has_more
def download_pdf(url, filename, timeout=60):
    """Download *url* and save it to *filename*.

    Parameters:
        url: direct link to the PDF on static.cninfo.com.cn.
        filename: destination path for the saved file.
        timeout: seconds before the request is abandoned (new optional
            parameter; the default keeps existing call sites working).

    Streams the body in chunks so large annual reports are never held in
    memory all at once.
    """
    # stream=True defers the body download until iter_content below.
    response = requests.get(url, stream=True, timeout=timeout)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        # Original printed the literal placeholder "(unknown)"; report the
        # actual file so progress is traceable.
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download: {filename}, Status code: {response.status_code}")
def main():
    """Drive the scrape: page through the cninfo query results, save an
    index of matching annual reports to Excel, then download each PDF.

    Any exception is caught at this top-level boundary and reported, so a
    single bad record does not leave the run half-finished silently.
    """
    try:
        start_date = "2023-11-01"  # YYYY-MM-DD, the format the cninfo API expects
        end_date = "2024-06-30"
        page_size = 30
        page_num = 1
        all_announcements = []
        while True:
            announcements, has_more = fetch_announcements(
                start_date, end_date, page_num=page_num, page_size=page_size)
            if not announcements:
                break
            all_announcements.extend(announcements)
            if not has_more:
                break
            page_num += 1
        if all_announcements:
            # Persist the index of everything we are about to download.
            df = pd.DataFrame(
                all_announcements,
                columns=["公司代码", "公司名称", "年报标题", "下载地址"])
            excel_path = os.path.join(base_path, "announcements.xlsx")
            df.to_excel(excel_path, index=False)
            print(f"数据已保存到 {excel_path} 文件中")
            # Map every character Windows forbids in file names to '_'.
            # One translate() pass replaces the original chain of .replace()
            # calls and also covers '<', '>' and '|', which were missed.
            unsafe = str.maketrans({c: '_' for c in '/\\:*?"<>|'})
            for sec_code, sec_name, title, download_url in all_announcements:
                safe_title = title.translate(unsafe).strip()
                # Company names like '*ST...' contain '*', so sanitize them too.
                safe_name = sec_name.translate(unsafe).strip()
                # os.path.join instead of a hard-coded '\\' keeps the path
                # portable and consistent with the rest of the file.
                filename = os.path.join(
                    announcements_dir, f"{sec_code}_{safe_name}_{safe_title}.pdf")
                download_pdf(download_url, filename)
    except Exception as e:
        print(f"发生错误:{e}")
# Entry point: run the scrape only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()