import requests
import os
import re
def download_file_insectbase(species_id):
    """Download the genome archive of one species from InsectBase.

    The archive ``<species_id>.genome.fa.tar.bz2`` is requested over HTTP and,
    on a 200 response, written into the directory named by the module-level
    ``output_dir`` (created if it does not exist). Failures are reported on
    stdout and never raised, so a batch run can continue with the next
    species.

    Args:
        species_id: Species name as it appears in the InsectBase download
            path (spaces already replaced by underscores).

    Returns:
        None.
    """
    # Build the download URL.
    base_url = "http://v2.insect-genome.com/api/Download/..-01_data-01_species-"
    # Derive the file name from species_id directly instead of splitting the
    # URL on '-', which breaks for species IDs that contain a hyphen.
    file_name = species_id + ".genome.fa.tar.bz2"
    ftp_path = base_url + species_id + "-" + file_name

    # Fetch the file.
    kv = {'user-Agent': 'Mozilla/5.0'}
    try:
        # The timeout keeps a stalled server from hanging the whole batch.
        response = requests.get(ftp_path, headers=kv, timeout=60)
    except requests.RequestException:
        print(f"第一次爬取失败!原因可能是在insectbase网站中不存在{species_id},或者网站拒绝接入!")
        # No response object exists, so there is nothing left to inspect.
        return

    if response.status_code != 200:
        print(f"Failed to download file from {ftp_path}. Status code: {response.status_code}")
        return

    print(file_name)
    # Build the full destination path.
    file_path = os.path.join(output_dir, file_name)
    os.makedirs(output_dir, exist_ok=True)
    # Write the raw bytes; the payload is a binary archive, so no text
    # encoding detection is performed on it.
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded {file_name} to {output_dir}")
def batch_download(species_ids_file):
    """Download the genome archive for every species listed in a text file.

    The file is read line by line. Surrounding whitespace is stripped, inner
    spaces are replaced by underscores, and blank lines are skipped so they
    do not trigger a request for an empty species name.

    Args:
        species_ids_file: Path to a text file with one species name per line.

    Returns:
        None.
    """
    # Read the list of species names.
    with open(species_ids_file, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        species_ids = [name.replace(' ', '_') for name in stripped_lines if name]
    # Process the species one after another.
    for species_id in species_ids:
        print(f"正在尝试下载物种文件 {species_id} ,请稍候。。。")
        download_file_insectbase(species_id)
# Directory that receives the downloaded archives; download_file_insectbase
# reads this module-level name when it saves a file.
output_dir = "./downloads"
# Text file listing the species to fetch, one name per line.
species_ids_file = "./species_ids.txt"
# Run the batch download for every species in the list.
batch_download(species_ids_file)
# Printed once batch_download has returned.
print("文件中所有物种文件已经下载完毕!")
# Script file name: insect_download_all_file.py
# (Blog page footer: latest recommended article published 2024-09-24 08:48:12)