NCBI 的 taxonomy的nodes.dmp 如何向上查找指定等级的taxid
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 7 17:17:13 2021
@author: dujidan
"""
import sys
nodes_file = 'taxonomy/nodes.dmp'
def nodes2dict(nodes_file):
nodes_p_r_dict = {}
with open(nodes_file) as f_nodes:
node_data_list = [i.split('\t') for i in f_nodes.read().strip().split('\n')]
for line_list in node_data_list:
taxid = line_list[0]
parent_taxid = line_list[2]
rank = line_list[4]
nodes_p_r_dict[taxid] = {'parent_taxid': parent_taxid,
'rank': rank}
return nodes_p_r_dict
def find_parent_in_lvl(input_taxid, nodes_p_r_dict, taxonomy_lvl='genus', up_count=0):
up_count += 1
if up_count >= 20:
return False
parent_taxid = nodes_p_r_dict[input_taxid]['parent_taxid']
rank = nodes_p_r_dict[input_taxid]['rank']
# print(rank, taxonomy_lvl)
if rank == taxonomy_lvl:
# print(input_taxid, rank)
return input_taxid
else:
return find_parent_in_lvl(parent_taxid, nodes_p_r_dict, taxonomy_lvl, up_count)
def taxid2parent(input_taxid_list, taxonomy_lvl='genus', nodes_file='/home/sunxijun/ref/metagenome/kraken2_db/taxonomy/nodes.dmp'):
return_list = []
nodes_p_r_dict = nodes2dict(nodes_file)
# modified input list
if type(input_taxid_list) is list:
input_taxid_list = input_taxid_list
else:
input_taxid_list = eval(input_taxid_list)
# find taxid parent
for input_taxid in input_taxid_list:
if input_taxid not in nodes_p_r_dict.keys():
print(input_taxid, '\ttaxid_error')
return_list.append([input_taxid, 'taxid_error'])
else:
parent_taxid = find_parent_in_lvl(str(input_taxid), nodes_p_r_dict=nodes_p_r_dict, taxonomy_lvl='genus')
print(f'{input_taxid}\t{parent_taxid}')
return_list.append([input_taxid, parent_taxid])
return return_list
if __name__ == '__main__':
if len(sys.argv) != 2:
print(f'Error:\n python3 {sys.argv[0]} \t [taxid_1,taxid_2]')
sys.exit(1)
input_taxid_list = sys.argv[1] #[9606,35]
parent_taxid_list = taxid2parent(input_taxid_list)
输入要是列表形式的,list中不能有空格
默认是属(genus)的级别,如果想换其他等级,可以修改 taxonomy_lvl