import json
import pandas as pd
# Load the CARD ARO index: a tab-separated table whose first row holds the
# column names.
df1 = pd.read_csv(r'card数据\aro_index.tsv', sep='\t', header=0)
print(df1)
print(df1['AMR Gene Family'])
print(df1.iloc[:, 8])

# Gene families whose models should be extracted. Matching below is by exact
# string equality with the table cell.
list_family = [
    'TEM beta-lactamase',
    'glycopeptide resistance gene cluster;van ligase',
    'major facilitator superfamily (MFS) antibiotic efflux pump',
    'methicillin resistant PBP2',
    'Penicillin-binding protein mutations conferring resistance to beta-lactam antibiotics',
    'Erm 23S ribosomal RNA methyltransferase',
    'SHV beta-lactamase',
    'CTX-M beta-lactamase',
    'OXA beta-lactamase',
    'KPC beta-lactamase',
    'NDM beta-lactamase',
    'IMP beta-lactamase',
    'VIM beta-lactamase',
    'ROB beta-lactamase',
    'fluoroquinolone resistant gyrA',
]

# Keep the 'Model Name' of every row whose value in column position 8 is one
# of the wanted families. Row order is preserved and repeated names are kept.
# A single vectorized membership test replaces the former per-row, per-family
# Python loop.
# NOTE(review): column position 8 is presumably 'AMR Gene Family' (both are
# printed above for comparison) - confirm, then select the column by name.
family_mask = df1.iloc[:, 8].isin(list_family)
list_gene1 = df1.loc[family_mask, 'Model Name'].tolist()
print(list_gene1)
print(len(list_gene1))
# Load the CARD database dump. The file is decoded explicitly as UTF-8 rather
# than with the platform default encoding, and the context manager closes the
# handle as soon as parsing is done.
with open(r'card.json', encoding='utf-8') as card_file:
    dict1 = json.load(card_file)

# Layout of one model entry, as recorded from the original author's
# exploratory prints (example entry: model_id 2, model_name 'CblA-1',
# ARO_accession 3002999, ARO_id 39433):
#   entry keys: model_id, model_name, model_type, model_type_id,
#       model_description, model_param, model_sequences, ARO_accession,
#       ARO_id, ARO_name, ARO_description, ARO_category
#   entry['ARO_category'][<id>] keys: category_aro_accession,
#       category_aro_cvterm_id, category_aro_name,
#       category_aro_description, category_aro_class_name
#   entry['model_sequences']['sequence'][<sequence id>] keys:
#       protein_sequence, dna_sequence, NCBI_taxonomy
#   ...['dna_sequence']['sequence'] is the DNA sequence text
#   ...['NCBI_taxonomy'] keys: NCBI_taxonomy_cvterm_id, NCBI_taxonomy_name,
#       NCBI_taxonomy_id

# Every top-level key except the last three.
# NOTE(review): the last three keys are presumably non-model metadata
# entries - confirm against the JSON file; this slice depends on key order.
list1 = list(dict1.keys())[0:-3]
print(list1)
# Debug dump: for every model entry named 'mecA', show its name, the whole
# record, and its sequence section.
for model_key in list1:
    record = dict1[model_key]
    if record['model_name'] != 'mecA':
        continue
    print(record['model_name'])
    print(record)
    print(record['model_sequences'])
# Build FASTA text for every model entry whose name is listed in list_gene1
# and write it to disk.
# (A disabled alternative in the original read the wanted names, one per
# line, from r'需要提取的序列-1.txt' instead of using list_gene1 as built
# above.)

# Number of times each wanted name is listed. A name listed k times yields k
# copies of its record, exactly as the former nested equality loop did.
# NOTE(review): if repeated records are not intended, de-duplicate list_gene1.
wanted_counts = {}
for wanted_name in list_gene1:
    wanted_counts[wanted_name] = wanted_counts.get(wanted_name, 0) + 1

fasta_records = []
for model_key in list1:
    entry = dict1[model_key]
    model_name = entry['model_name']
    for _ in range(wanted_counts.get(model_name, 0)):
        print(model_name)
        if 'model_sequences' not in entry:
            # Entries without a sequence section contribute no record.
            continue
        # Only the first sequence listed for a model is exported.
        sequences = entry['model_sequences']['sequence']
        first_sequence = next(iter(sequences.values()))
        dna = first_sequence['dna_sequence']['sequence']
        taxonomy = first_sequence['NCBI_taxonomy']
        print(dna)
        # Header line: >model name,taxonomy name,taxonomy id
        fasta_records.append(
            '>' + model_name
            + ',' + taxonomy['NCBI_taxonomy_name']
            + ',' + taxonomy['NCBI_taxonomy_id'] + '\n'
            + dna
        )

# Records are separated by newlines and the text has no trailing newline.
# Collecting the parts in a list and joining once avoids rebuilding an
# ever-growing string on every match.
line1 = '\n'.join(fasta_records)
print(line1)

# The context manager closes the file; no explicit close() is needed.
with open(r'20220309所有需要提取的序列.fa', 'w', encoding='utf-8') as f:
    f.write(line1)
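# Title: Extracting FASTA sequences from the CARD antibiotic resistance database (2022-03-02)
# Source-page footer: latest recommended article published 2024-07-29 17:17:32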