一、任务描述
- 从上交所的官网上爬取年报问询函的记录
二、解决思路
- 解析网页获取全部的年报问询函列表及相应的文件链接
- 打开第一步获取的文件链接,读取 PDF 数据,并直接转成 TXT 格式的文字
三、网页分析
以上交所网站为例:
import logging
import os
import threading
import random
import time
from urllib.request import urlopen
from urllib.request import Request
import pandas as pd
# Log every completed download to download.log with a timestamp.
logging.basicConfig(
    filename='download.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

# URLs already scheduled in this run, used to avoid duplicate downloads.
downloaded_urls = set()

# Load the previously scraped list of inquiry letters (semicolon-separated, GBK-encoded).
_COLUMNS = ['Date', 'Title', 'StockCode', 'Type', 'Company', 'Doc', 'CreateTime', 'docURL']
df = pd.read_table(
    'D:\\Users\\admin\\Documents\\testpython\\上交所列表.txt',
    sep=';',
    header=None,
    names=_COLUMNS,
    encoding='GBK',
)
def get_file_from_url(docURL, Title):
    """Download the PDF at *docURL* and save it as "<Title>.pdf".

    If a partial file of the same name already exists, resume the download
    with an HTTP ``Range`` request instead of appending the whole body again
    (the original code built the Range request but never sent it, so resumed
    files were corrupted by duplicated bytes).

    Progress and completion are reported on stdout and in the download log.
    """
    # Rotate User-Agent strings so requests look less like a single bot.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        # ... (additional User-Agent strings)
    ]
    # Random delay of 10-80 seconds between downloads to avoid hammering the server.
    time.sleep(random.uniform(10, 80))
    # URLs/titles read from the table may come back as floats (e.g. NaN) — normalize once.
    docURL = str(docURL)
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print('开始下载:%s' % Title)
    headers = {
        "User-Agent": random.choice(user_agents),
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
    }
    file_path = 'D:\\Users\\admin\\Documents\\testpython\\下载\\' + Title + '.pdf'
    # Resume support: if a partial file exists, request only the remaining bytes.
    # The Range header must be set BEFORE the request is sent.
    file_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0
    if file_size:
        headers['Range'] = f'bytes={file_size}-'
    request = Request(docURL, headers=headers)
    # Context managers close both the HTTP response and the file even on errors
    # (the original leaked the response object).
    with urlopen(request) as fp, open(file_path, 'ab') as file:
        # Read fixed-size binary chunks; iterating the response line-by-line
        # would split the PDF on arbitrary newline bytes.
        while True:
            chunk = fp.read(8192)
            if not chunk:
                break
            file.write(chunk)
            file_size += len(chunk)
            print(f'已下载 {file_size / 1024} KB')
    logging.info(f'{Title} 下载完成')
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print('%s 下载完成' % Title)
# Threads spawned for downloads; all are joined before the script exits.
threads = []
# Maximum number of simultaneously active download threads.
# (The original declared this limit but never enforced it, spawning one
# unbounded thread per row.)
max_connections = 5

if __name__ == '__main__':
    for index, row in df.iterrows():
        # Titles may be parsed as floats; also strip '*' which is illegal in
        # Windows file names.
        Title = str(row["Title"]).replace('*', '')
        docURL = row["docURL"]
        url = 'http://' + docURL
        # Skip URLs already scheduled in this run.
        if url in downloaded_urls:
            continue
        # Throttle: wait until fewer than max_connections downloads are active.
        while sum(1 for t in threads if t.is_alive()) >= max_connections:
            time.sleep(1)
        thread = threading.Thread(target=get_file_from_url, args=(url, Title))
        thread.start()
        threads.append(thread)
        downloaded_urls.add(url)
    # Wait for every download thread to finish.
    for thread in threads:
        thread.join()