Extracting URLs from .sh files
(1) Version with a specified .sh file and .txt file
import re

# Extract URLs from a wget-style .sh script
def extract_urls(sh_file):
    urls = []
    with open(sh_file, 'r', encoding='utf-8') as file:  # explicitly read as utf-8
        content = file.read()
        # Use a regular expression to pull out only the URLs
        urls = re.findall(r"'(http\S+\.nc)'", content)
    return urls

# Save the extracted URLs to a txt file
def save_urls_to_txt(urls, output_txt):
    # 'w' mode truncates the output file on each run
    with open(output_txt, 'w', encoding='utf-8') as file:  # likewise utf-8
        for url in urls:
            file.write(url + '\n')

# Specify the input .sh file and the output .txt file
input_sh_file = 'EC-Earth3-Veg_historical.sh'
output_txt_file = 'EC-Earth3-Veg_historical1.txt'

# Extract and save
urls = extract_urls(input_sh_file)
save_urls_to_txt(urls, output_txt_file)
print(f"URLs saved to {output_txt_file}")
(2) Version that processes whole .sh and .txt folders
import os
import re

# Extract URLs from a wget-style .sh script
def extract_urls(sh_file):
    urls = []
    with open(sh_file, 'r', encoding='utf-8') as file:  # explicitly read as utf-8
        content = file.read()
        # Use a regular expression to pull out only the URLs
        urls = re.findall(r"'(http\S+\.nc)'", content)
    return urls

# Save the extracted URLs to a txt file
def save_urls_to_txt(urls, output_txt):
    # 'w' mode truncates the output file on each run
    with open(output_txt, 'w', encoding='utf-8') as file:  # likewise utf-8
        for url in urls:
            file.write(url + '\n')

# Process every .sh file in a directory
def process_sh_files(sh_dir, txt_dir):
    os.makedirs(txt_dir, exist_ok=True)  # make sure the output folder exists
    for filename in os.listdir(sh_dir):
        if filename.endswith('.sh'):
            sh_file_path = os.path.join(sh_dir, filename)
            txt_file_path = os.path.join(txt_dir, filename.replace('.sh', '.txt'))
            # Extract and save the URLs
            urls = extract_urls(sh_file_path)
            save_urls_to_txt(urls, txt_file_path)
            print(f"Extracted URLs saved to {txt_file_path}")

# Specify the input .sh folder and the output .txt folder
sh_directory = 'D:/Temprture/sh文件/sh2'  # folder containing the .sh files
txt_directory = 'D:/Temprture/txt文件'    # folder for the .txt files

# Run it
process_sh_files(sh_directory, txt_directory)
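If you prefer pathlib, the directory walk can be written more compactly. This is a sketch that reuses extract_urls and save_urls_to_txt from above, with the same example paths:

from pathlib import Path

# pathlib variant of process_sh_files
sh_dir = Path('D:/Temprture/sh文件/sh2')
txt_dir = Path('D:/Temprture/txt文件')
txt_dir.mkdir(parents=True, exist_ok=True)  # make sure the output folder exists
for sh_path in sh_dir.glob('*.sh'):
    txt_path = txt_dir / (sh_path.stem + '.txt')
    save_urls_to_txt(extract_urls(sh_path), txt_path)
    print(f"Extracted URLs saved to {txt_path}")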
Single-threaded download of .nc files from a URL text file
Single-threaded on purpose: I tried multithreading, but it left files incomplete. The script supports resuming (it continues from where the previous interrupted download stopped) and skips files that are already fully downloaded.
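The resume logic relies on the HTTP Range header: if a partial file already exists, the script asks the server for bytes=<local_size>- and appends the reply. A minimal illustration of the mechanism; the URL and size are placeholders, and the full script below adds retries and size checks:

import requests

url = 'http://example.org/some_file.nc'  # placeholder URL
resume_from = 1_048_576                  # pretend 1 MiB is already on disk
resp = requests.get(url, stream=True, headers={'Range': f'bytes={resume_from}-'})
# 206 means the server honored the Range request and sent only the remaining bytes;
# 200 means it ignored the header and sent the whole file from the start
print(resp.status_code)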
import os
import logging
from time import sleep

import requests

# Path to the txt file with the URLs, and the target folder
txt_file_path = 'D:/Temprture/txt文件/EC-Earth3-Veg-LR_historical.txt'
target_dir = 'G:/so/EC-Earth3-Veg-LR/historical/'

# Create the target folder if it does not exist
os.makedirs(target_dir, exist_ok=True)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_file_complete(url, local_file_path):
    """Check whether the local file is already complete."""
    try:
        # Follow redirects so content-length comes from the final response
        response = requests.head(url, allow_redirects=True, timeout=30)
        response.raise_for_status()
        # File size on the server
        total_size = int(response.headers.get('content-length', 0))
        # Compare the local file size with the server file size
        if os.path.exists(local_file_path):
            local_file_size = os.path.getsize(local_file_path)
            if local_file_size == total_size:
                logging.info(f"{local_file_path} is already complete, skipping.")
                return True
        return False
    except requests.RequestException as e:
        logging.error(f"Could not get the file size for {url}. Error: {e}")
        return False

def download_file(url, retry_count=3):
    """Download a single file with resume support, retrying on failure."""
    if not url:
        return
    file_name = os.path.join(target_dir, url.split('/')[-1])  # file name and save path
    # Skip files that are already fully downloaded
    if is_file_complete(url, file_name):
        return
    file_size = os.path.getsize(file_name) if os.path.exists(file_name) else 0  # bytes already on disk
    headers = {'Range': f'bytes={file_size}-'} if file_size > 0 else {}
    logging.info(f'Downloading {url} to {file_name}, {file_size} bytes already downloaded')
    try:
        response = requests.get(url, stream=True, headers=headers, timeout=60)
        response.raise_for_status()  # raise if the response is bad
        # If we asked for a Range but got a 200, the server sent the whole file:
        # start from scratch instead of appending to the partial file
        if file_size > 0 and response.status_code != 206:
            file_size = 0
        total_size = int(response.headers.get('content-length', 0)) + file_size  # full size on the server
        with open(file_name, 'ab' if file_size > 0 else 'wb') as f:
            for chunk in response.iter_content(chunk_size=65536):
                if chunk:
                    f.write(chunk)
        # Verify that the file is complete
        if os.path.getsize(file_name) != total_size:
            logging.warning(f'{file_name} is incomplete, retrying...')
            os.remove(file_name)
            if retry_count > 0:
                sleep(2)  # wait 2 seconds before retrying
                download_file(url, retry_count - 1)
            else:
                logging.error(f'Download failed, retries exhausted: {url}')
        else:
            logging.info(f'{file_name} downloaded successfully!')
    except requests.exceptions.RequestException as e:
        if e.response is not None and e.response.status_code == 416:
            # 416 Range Not Satisfiable: the local file already covers the full range
            logging.info(f"{file_name} is already complete, skipping.")
        else:
            logging.error(f'Download failed: {url}. Error: {e}')
            if retry_count > 0:
                sleep(2)  # wait 2 seconds before retrying
                download_file(url, retry_count - 1)

# Read all URLs from the txt file
with open(txt_file_path, 'r') as file:
    urls = [line.strip() for line in file if line.strip()]  # all non-empty lines

# Download the files one by one, single-threaded
for url in urls:
    download_file(url)
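Since the motivation for staying single-threaded was truncated files, an optional post-download check can catch corrupt downloads that happen to have the right size. This sketch assumes the netCDF4 package is installed (it is not used anywhere else in this workflow) and reuses os and target_dir from the script above:

from netCDF4 import Dataset  # assumption: pip install netCDF4

def nc_opens_cleanly(path):
    """Return True if the file parses as a valid netCDF file."""
    try:
        with Dataset(path):  # opening reads the header; truncated files usually fail here
            return True
    except OSError:
        return False

bad = [f for f in os.listdir(target_dir)
       if f.endswith('.nc') and not nc_opens_cleanly(os.path.join(target_dir, f))]
print(f'{len(bad)} suspicious file(s): {bad}')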