从巨潮网批量爬取三板公司年报

最新推荐文章于 2025-05-23 09:16:47 发布

yioo_0825

最新推荐文章于 2025-05-23 09:16:47 发布

阅读量673

点赞数 3

文章标签： python 爬虫前端

本文链接：https://blog.csdn.net/yioo_0/article/details/143798337

版权

1.在b站和CSDN找到很多批量爬取年报的方法。我找到的大部分代码不区分公告类型，但是我只需要年报。

2.我想要的是三板挂牌公司的年度报告，但我找到的方法下载的都是主板上市公司的公告。怎样将参数中的证券类型替换为三板一度难倒了我。经过自学摸索，终于在浏览器的开发人员工具中找到了巨潮网三板的参数：

"column": "third",

"category": "category_dqgg"

3.我不需要年度报告摘要、英文版或半年度报告等不符合下载需要的文件。我在B站找到一个视频（链接见下），其中设置了下载条件。奇怪的是，按照up主的代码可以下载主板上市公司的公告，但换成三板的参数后，下载的pdf都只有1kb，无法打开。

python爬取巨潮资讯上市公司年报_哔哩哔哩_bilibili

4.找到这个链接，很简洁的代码，关键是真的管用。我把b站up主设定的下载条件添加到这个链接提供的代码里。三板的年度报告终于可以批量下载了。

【工作提效】教你如何用Python轻松爬取上市公司年报，新手也能快速上手！copy即用_教你如何用python轻松爬取上市公司年报,新手也能快速上手-CSDN博客

以下是下载 2023-11-01 至 2024-06-30 期间新三板挂牌公司年度报告的源代码。

import requests
import pandas as pd
import os

# HTTP request headers sent with the announcement query (form-encoded XHR request)
headers = {
    "accept": "*/*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "proxy-connection": "keep-alive",
    "x-requested-with": "XMLHttpRequest"
}

# Absolute output root; the Excel index is written here and PDFs go in a sub-folder
base_path = 'D:\\PycharmProjects\\pythonProject1\\Report_2023'
announcements_dir = os.path.join(base_path, 'announcements')
# Create the PDF folder (and any missing parents); no error if it already exists
os.makedirs(announcements_dir, exist_ok=True)


# Fetch and filter one page of announcement data
# Title keywords that disqualify an announcement: notices, summaries,
# cancelled filings, English versions and semi-annual reports.
_EXCLUDED_TITLE_KEYWORDS = ("公告", "摘要", "已取消", "英文", "半年度")


def _is_annual_report(title):
    """Return True if *title* contains "年度报告" and no excluded keyword."""
    return "年度报告" in title and not any(
        keyword in title for keyword in _EXCLUDED_TITLE_KEYWORDS
    )


def fetch_announcements(start_date, end_date, page_num=1, page_size=30):
    """Query one page of announcements and keep only full annual reports.

    Args:
        start_date: First day of the query range, formatted ``YYYY-MM-DD``.
        end_date: Last day of the query range, formatted ``YYYY-MM-DD``.
        page_num: 1-based page index sent as ``pageNum``.
        page_size: Number of announcements requested per page.

    Returns:
        A ``(rows, has_more)`` tuple. ``rows`` is a list of
        ``[sec_code, sec_name, title, download_url]`` for the announcements on
        this page that pass the title filter; it may be empty even when
        ``has_more`` is true. ``has_more`` is the ``hasMore`` flag from the
        response, or ``False`` when the request failed or returned no usable
        announcement list.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"
    body = {
        "pageNum": page_num,
        "pageSize": page_size,
        # "column"/"plate"/"category" are the values the site sends for the
        # third board's periodic reports (found via browser dev tools).
        "column": "third",
        "tabName": "fulltext",
        "plate": "neeq",
        "stock": "",
        "searchkey": "",
        "secid": "",
        "category": "category_dqgg",
        # Semicolon-separated list of industry names to include.
        "trade": "农、林、牧、渔业;电力、热力、燃气及水生产和供应业;交通运输、仓储和邮政业;科学研究和技术服务业;教育;综合;卫生和社会工作;"
                 "水利、环境和公共设施管理业;住宿和餐饮业;建筑业;采矿业;文化、体育和娱乐业;居民服务、修理和其他服务业;租赁和商务服务业;房"
                 "地产业;信息传输、软件和信息技术服务业;批发和零售业;制造业",
        "seDate": f"{start_date}~{end_date}",
        "sortName": "",
        "sortType": "",
        "isHLtitle": "true"
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.post(url, headers=headers, data=body, timeout=30)

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return [], False

    try:
        data = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        print("Request succeeded but the response body is not valid JSON.")
        return [], False

    # The key can be present with a null value, so test the value rather than
    # mere key membership before iterating.
    raw_announcements = data.get("announcements") if isinstance(data, dict) else None
    if raw_announcements is None:
        print("No announcements found that match the criteria.")
        return [], False

    has_more = bool(data.get("hasMore", False))

    announcements_list = []
    for announcement in raw_announcements:
        # "or ''" also covers fields that are present but null.
        sec_code = announcement.get("secCode") or ""
        sec_name = announcement.get("secName") or ""
        title = announcement.get("announcementTitle") or ""
        adjunct_url = announcement.get("adjunctUrl") or ""

        # Nothing to download without an attachment path.
        if not adjunct_url:
            continue

        if _is_annual_report(title):
            download_url = f"http://static.cninfo.com.cn/{adjunct_url}"
            announcements_list.append([sec_code, sec_name, title, download_url])

    print(f"Fetched {len(announcements_list)} announcements. hasMore is {has_more}")
    return announcements_list, has_more


# Download a single PDF file to disk
def download_pdf(url, filename):
    """Download *url* and write the response body to *filename*.

    Failures (network error, timeout, or a non-200 status) are reported with
    ``print`` and the function returns without writing a file, so one bad
    download does not abort a batch.

    Args:
        url: Address of the file to download.
        filename: Destination path; opened in binary write mode.

    Raises:
        OSError: If *filename* cannot be opened or written.
    """
    try:
        # Without a timeout a stalled connection would block the batch forever.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to download: {filename}, Error: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to download: {filename}, Status code: {response.status_code}")
        return

    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {filename}")


# Script entry point
def main():
    """Collect annual-report announcements, save an index, and download the PDFs.

    Pages through ``fetch_announcements`` for the fixed date range below,
    writes the collected rows to ``announcements.xlsx`` under ``base_path``,
    then downloads every PDF into ``announcements_dir``. Any exception is
    caught at this top-level boundary and printed.
    """
    try:
        start_date = "2023-11-01"  # Format is YYYY-MM-DD; do not swap month and day
        end_date = "2024-06-30"
        page_size = 30
        page_num = 1
        all_announcements = []

        while True:
            announcements, has_more = fetch_announcements(start_date, end_date, page_num=page_num, page_size=page_size)
            # A page may contain no matching titles while later pages still
            # do, so only the has_more flag decides when to stop.
            all_announcements.extend(announcements)
            if not has_more:
                break
            page_num += 1

        if all_announcements:
            # Build a DataFrame from the collected rows
            df = pd.DataFrame(all_announcements, columns=["公司代码", "公司名称", "年报标题", "下载地址"])

            # Save the index to an Excel file
            excel_path = os.path.join(base_path, "announcements.xlsx")
            df.to_excel(excel_path, index=False)
            print(f"数据已保存到 {excel_path} 文件中")

            # Characters that are not allowed in Windows file names
            unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

            # Download the PDF files
            for sec_code, sec_name, title, download_url in all_announcements:
                # Sanitise the whole stem: the company name can contain
                # reserved characters too, not just the title.
                stem = f"{sec_code}_{sec_name}_{title.strip()}".translate(unsafe_chars)
                filename = os.path.join(announcements_dir, f"{stem}.pdf")
                download_pdf(download_url, filename)
    except Exception as e:
        print(f"发生错误：{e}")


# Run main() only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()