下载脚本
频繁下载会触发 HTTP 429(请求过多)错误,因此脚本在每次请求前加入了延时,并在请求失败时自动重试。即便如此,仍可能有个别文件下载失败,此时可以重新运行脚本再次下载。
import requests
import os
from urllib.parse import urlparse, parse_qs
import time
# Browser-like request headers passed to requests.get() by the downloader.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}
def download_file_from_link(url, output_dir, max_retries=5, delay=15, timeout=30):
    """Download ``url`` into ``output_dir``, retrying on request errors.

    The local file name is taken from the ``fileName`` query parameter of
    ``url`` (``'unknown'`` when the parameter is absent).

    Args:
        url: Download link; expected to carry a ``fileName`` query parameter.
        output_dir: Existing directory the file is written into.
        max_retries: Maximum number of download attempts.
        delay: Seconds to sleep before every attempt, including the first.
        timeout: Seconds ``requests`` waits for the server before the attempt
            is treated as failed (and therefore retried).

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        OSError: If the local file cannot be written; only request errors
            are retried.
    """
    # Parse the URL and take the file name from its query string.
    query_params = parse_qs(urlparse(url).query)
    file_name = query_params.get('fileName', ['unknown'])[0]

    local_file_path = os.path.join(output_dir, file_name)
    # Stream into a temporary sibling first so that an interrupted transfer
    # never leaves a truncated file under the final name.
    partial_path = local_file_path + '.part'

    for attempt in range(max_retries):
        # Wait before each request to reduce the chance of HTTP 429.
        time.sleep(delay)
        try:
            # Without a timeout a stalled connection would block forever
            # instead of reaching the retry logic.
            with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
                r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
                with open(partial_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
        except requests.exceptions.RequestException as e:
            # Drop whatever was written before the failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"Error downloading {url} (attempt {attempt+1}/{max_retries}): {e}")
            if attempt == max_retries - 1:
                print(f"Max retries reached for {url}. Giving up.")
            continue

        # Publish the complete file under its final name in one step.
        os.replace(partial_path, local_file_path)
        print(f"Downloaded {local_file_path} (attempt {attempt+1}/{max_retries})")
        return
# Read the download links from a text file, one link per line.
file_path = 'data.txt'  # link list; resolved relative to the current working directory
output_dir = 'data'  # directory the downloaded files are written to

# exist_ok makes this a no-op when the directory already exists and avoids
# the race between checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

# utf-8-sig reads plain UTF-8 as well as UTF-8 with a leading BOM; a BOM
# would otherwise end up glued to the front of the first URL.
with open(file_path, 'r', encoding='utf-8-sig') as file:
    links = file.readlines()

# Download every non-blank link in file order.
for link in links:
    link = link.strip()  # drop the trailing newline and surrounding whitespace
    if link:
        download_file_from_link(link, output_dir)

print("All files have been processed.")
找出未成功下载文件的脚本
如果最后只剩少数几个文件没有下载成功,可以先用下面的脚本找出这些文件,然后手动下载。
import os
from urllib.parse import parse_qs, urlparse

# Paths of the link list and of the download directory.
text_file_path = 'data.txt'  # text file with the URLs to check, one per line
download_dir = 'data'  # directory the downloaded files were saved to

# utf-8-sig reads plain UTF-8 as well as UTF-8 with a leading BOM.
with open(text_file_path, 'r', encoding='utf-8-sig') as file:
    links = file.readlines()

# Report every link whose target file is missing from the download directory.
for link in links:
    link = link.strip()  # drop the trailing newline and surrounding whitespace
    if not link:
        continue

    # Take the file name from the decoded ``fileName`` query parameter,
    # falling back to 'unknown' when the link has none. Parsing the query
    # string (rather than slicing the raw text) handles percent-encoded
    # names and links without the parameter.
    query_params = parse_qs(urlparse(link).query)
    file_name = query_params.get('fileName', ['unknown'])[0]

    local_file_path = os.path.join(download_dir, file_name)
    if not os.path.exists(local_file_path):
        print(f"{file_name} 未下载。")
请根据实际情况自行修改上述脚本中的文件路径(链接列表文件 data.txt 和下载目录 data)。