NCBI 虽然提供了专门的下载方式,但是比较复杂;如果想了解具体怎么下载,可以私聊我。
我这里提供的是我改写过的脚本,用起来更加方便、快捷。话不多说,批量下载开始!
需要先用 conda 安装 NCBI 的 datasets 命令行工具(例如:conda install -c conda-forge ncbi-datasets-cli),然后就可以使用了。
import argparse
import zipfile
import glob
from pathlib import Path
import subprocess
import shutil
#author = Zhou.wangyi
#email = 772967843@qq.com
def download(inputfile, outputfile, include='gff3'):
    """Batch-download NCBI genome packages and extract their GFF annotation.

    Every non-empty line of ``inputfile`` is treated as one accession. For
    each distinct accession the external ``datasets`` command is run as
    ``datasets download genome accession <acc> --include <include>
    --filename <outputfile>/<acc>.zip``. Afterwards every ``*.zip`` found in
    ``outputfile`` is searched for the member
    ``ncbi_dataset/data/<name>/genomic.gff`` (``<name>`` is the archive file
    name without ``.zip``) and that member is written to
    ``<outputfile>/<name>.gff``. The zip archives themselves are kept.

    Args:
        inputfile: Path to a text file with one accession per line.
        outputfile: Directory that receives the ``.zip`` archives and the
            extracted ``.gff`` files. It is created if it does not exist.
        include: Value passed verbatim to the ``--include`` option of
            ``datasets``. Only ``genomic.gff`` is extracted afterwards, so
            archives without that member are reported and left untouched.

    Raises:
        FileNotFoundError: If ``inputfile`` cannot be opened or the
            ``datasets`` executable is not on ``PATH``.
    """
    import sys  # local import: only needed for the warning messages below

    input_path = Path(inputfile).resolve()
    output_dir = Path(outputfile).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(input_path, 'r') as handle:
        stripped = (raw.strip() for raw in handle)
        # dict.fromkeys drops duplicate accessions while keeping file order;
        # blank lines would otherwise produce a download with no accession.
        accessions = list(dict.fromkeys(acc for acc in stripped if acc))

    for accession in accessions:
        # An argument list (no shell) keeps accessions and paths containing
        # spaces or shell metacharacters from being re-interpreted.
        command = [
            'datasets', 'download', 'genome', 'accession', accession,
            '--include', include,
            '--filename', str(output_dir / f'{accession}.zip'),
        ]
        finished = subprocess.run(command)
        if finished.returncode != 0:
            # Best effort: report the failure and continue with the rest.
            print(
                f'warning: download failed for {accession} '
                f'(exit code {finished.returncode})',
                file=sys.stderr,
            )

    for zip_path in sorted(output_dir.glob('*.zip')):
        prefix = zip_path.stem
        member = f'ncbi_dataset/data/{prefix}/genomic.gff'
        target = output_dir / f'{prefix}.gff'
        try:
            with zipfile.ZipFile(zip_path, 'r') as archive:
                # Stream the member straight to its destination so no
                # scratch directory has to be created and removed.
                with archive.open(member) as src, open(target, 'wb') as dst:
                    shutil.copyfileobj(src, dst)
        except (KeyError, zipfile.BadZipFile) as err:
            # KeyError: the archive has no genomic.gff member.
            # BadZipFile: the file is not a readable zip archive.
            print(f'warning: skipping {zip_path.name}: {err}', file=sys.stderr)
def main():
    """Read the command-line options and hand them to ``download``."""
    cli = argparse.ArgumentParser(
        description='Download genomic data from NCBI',
    )
    # (flags, keyword settings) for each option, in the order they are shown
    # in the generated help text.
    option_table = (
        (
            ('-i', '--inputfile'),
            {
                'type': str,
                'required': True,
                'help': 'Input file containing NCBI accession numbers',
            },
        ),
        (
            ('-o', '--outputfile'),
            {
                'type': str,
                'required': True,
                'help': 'Output directory for downloaded data',
            },
        ),
        (
            ('-include', '--include'),
            {
                'type': str,
                'default': 'gff3',
                'help': 'Type of data to download (gff3, fasta, etc.)',
            },
        ),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()
    download(options.inputfile, options.outputfile, options.include)
# Start the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
运行方式:python3 down.py -i list -o . 。其中 -i 参数为记录待下载 accession 号的文本文件(每行一个 accession),-o 参数为输出目录。