一、任务描述
- 从上交所的官网上爬取年报问询函的记录
二、解决思路
- 解析网页获取全部的年报问询函列表及相应的文件链接
- 打开第一步获取的文件链接,读取 PDF 数据,并直接转成 TXT 格式的文字
三、网页分析
以上交所网站为例:
import logging
import os
import threading
import random
import time
from urllib.request import urlopen
from urllib.request import Request
import pandas as pd
# Log every completed download to download.log with a timestamp.
logging.basicConfig(
    filename='download.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

# URLs already scheduled in this run, used to avoid duplicate downloads.
downloaded_urls = set()

# Load the previously scraped list of inquiry letters (semicolon-separated, GBK-encoded).
_COLUMNS = ['Date', 'Title', 'StockCode', 'Type', 'Company', 'Doc', 'CreateTime', 'docURL']
df = pd.read_table(
    'D:\\Users\\admin\\Documents\\testpython\\上交所列表.txt',
    sep=';',
    header=None,
    names=_COLUMNS,
    encoding='GBK',
)
def get_file_from_url(docURL, Title):
    """Download the PDF at *docURL* and save it as "<Title>.pdf".

    If a partial file of the same name already exists, resume the download
    with an HTTP ``Range`` request instead of appending the whole body again
    (the original code built the Range request but never sent it, so resumed
    files were corrupted by duplicated bytes).

    Progress and completion are reported on stdout and in the download log.
    """
    # Rotate User-Agent strings so requests look less like a single bot.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        # ... (additional User-Agent strings)
    ]
    # Random delay of 10-80 seconds between downloads to avoid hammering the server.
    time.sleep(random.uniform(10, 80))
    # URLs/titles read from the table may come back as floats (e.g. NaN) — normalize once.
    docURL = str(docURL)
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print('开始下载:%s' % Title)
    headers = {
        "User-Agent": random.choice(user_agents),
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
    }
    file_path = 'D:\\Users\\admin\\Documents\\testpython\\下载\\' + Title + '.pdf'
    # Resume support: if a partial file exists, request only the remaining bytes.
    # The Range header must be set BEFORE the request is sent.
    file_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0
    if file_size:
        headers['Range'] = f'bytes={file_size}-'
    request = Request(docURL, headers=headers)
    # Context managers close both the HTTP response and the file even on errors
    # (the original leaked the response object).
    with urlopen(request) as fp, open(file_path, 'ab') as file:
        # Read fixed-size binary chunks; iterating the response line-by-line
        # would split the PDF on arbitrary newline bytes.
        while True:
            chunk = fp.read(8192)
            if not chunk:
                break
            file.write(chunk)
            file_size += len(chunk)
            print(f'已下载 {file_size / 1024} KB')
    logging.info(f'{Title} 下载完成')
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print('%s 下载完成' % Title)
# Threads spawned for downloads; all are joined before the script exits.
threads = []
# Maximum number of simultaneously active download threads.
# (The original declared this limit but never enforced it, spawning one
# unbounded thread per row.)
max_connections = 5

if __name__ == '__main__':
    for index, row in df.iterrows():
        # Titles may be parsed as floats; also strip '*' which is illegal in
        # Windows file names.
        Title = str(row["Title"]).replace('*', '')
        docURL = row["docURL"]
        url = 'http://' + docURL
        # Skip URLs already scheduled in this run.
        if url in downloaded_urls:
            continue
        # Throttle: wait until fewer than max_connections downloads are active.
        while sum(1 for t in threads if t.is_alive()) >= max_connections:
            time.sleep(1)
        thread = threading.Thread(target=get_file_from_url, args=(url, Title))
        thread.start()
        threads.append(thread)
        downloaded_urls.add(url)
    # Wait for every download thread to finish.
    for thread in threads:
        thread.join()