例子:
20230506_conut_blast.revise.new
##
import pandas as pd
import argparse
parser=argparse.ArgumentParser(description='input error !')
parser.add_argument('-ip1','--inputpath1',required=True,help='input like NL230413-1_S113.blastn.xls')
parser.add_argument('-r','--reference',required=True,help='taxmap_slv_ssu_ref_nr_138.1.xls')
parser.add_argument('-op1','--outputpath1',required=True,help='NL230413-1_S113.blastn.result.xls')
parser.add_argument('-op2','--outputpath2',required=True,help='NL230413-1_S113.blastn.result.genus.xls')
parser.add_argument('-op3','--outputpath3',required=True,help='NL230413-1_S113.blastn.result.species.xls')
# inputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\NL230413-1_S113.blastn.xls'
# reference=r'C:\Users\Administrator\Desktop\自己测试捕获\taxmap_slv_ssu_ref_nr_138.1.xls'
# outputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\NL230413-1_S113.blastn.result.xls'
args = parser.parse_args()
inputpath1=args.inputpath1
reference=args.reference
outputpath1=args.outputpath1
outputpath2=args.outputpath2
outputpath3=args.outputpath3
# df1=pd.read_csv(inputpath1,sep='\t',header=None)
df_ref=pd.read_csv(reference,sep='\t')
dict_accession_to_genus = dict(zip(df_ref['primaryAccession'], df_ref['genus']))
dict_accession_to_species = dict(zip(df_ref['primaryAccession'], df_ref['species']))
dict_need_genus={}
dict_need_species={}
dict_need_genus1={}
dict_need_species1={}
with open(inputpath1,'r') as f2,open(outputpath1,'w') as f3:
for line in f2:
line=line.strip('\n')
list_in=line.split('\t')
list_in.append(dict_accession_to_genus[list_in[1]])
list_in.append(dict_accession_to_species[list_in[1]])
print(line)
# if list_in[0] in dict_need_genus:
# dict_need_genus[list_in[0]].append(list_in[-2])
# else:
# dict_need_genus[list_in[0]]=[list_in[-2]]
#
# if list_in[0] in dict_need_species:
# dict_need_species[list_in[0]].append(list_in[-1])
# else:
# dict_need_species[list_in[0]]=[list_in[-1]]
if list_in[0] in dict_need_genus1:
pass
else:
dict_need_genus1[list_in[0]]=list_in[-2]
if list_in[0] in dict_need_species1:
pass
else:
dict_need_species1[list_in[0]]=list_in[-2]
# for i in dict_need_genus:
# # dict_need1[i]=max(dict_need[i],key=dict_need[i].count)
# dict_need_genus1[i] = dict_need_genus[i][0]
#
# for i in dict_need_species:
# # dict_need1[i]=max(dict_need[i],key=dict_need[i].count)
# dict_need_species1[i] = dict_need_species[i][0]
##属
genus_value_counts=pd.value_counts(list(dict_need_genus1.values()))
dict_genus_value_counts=dict(zip(genus_value_counts.index,genus_value_counts.values))
df_genus_value_counts=pd.DataFrame.from_dict(dict_genus_value_counts,orient='index',columns=['count'])
df_genus_value_counts=df_genus_value_counts.rename_axis('genus').reset_index()
df_genus_value_counts.to_csv(outputpath2,sep='\t',index=None)
##种
species_value_counts = pd.value_counts(list(dict_need_species1.values()))
dict_species_value_counts = dict(zip(species_value_counts.index, species_value_counts.values))
print(dict_species_value_counts)
df_species_value_counts = pd.DataFrame.from_dict(dict_species_value_counts, orient='index', columns=['count'])
df_species_value_counts = df_species_value_counts.rename_axis('species').reset_index()
print(df_species_value_counts)
df_species_value_counts.to_csv(outputpath3,sep='\t',index=None)
20230508_bwa.10_genome.count
import pandas as pd
import click
@click.command()
@click.option('-ip1','--inputpath1',help='inputpath1(contig.mapped.mapped.sam|..)',required=True)
@click.option('-ip2','--inputpath2',help='inputpath2(NL230413-1_S113.inter.xls|..)',required=True)
# @click.option('-op1','--outputpath1',help='outputpath1(contig.mapped.mapped.inter.xls|..)',required=True)
# @click.option('-op1','--outputpath1',help='outputpath1(genus.count.xls|..)',required=True)
@click.option('-op2','--outputpath2',help='outputpath2(species.count.xls|..)',required=True)
def count_genus_species(inputpath1,inputpath2,outputpath2):
# inputpath1 = r'C:\Users\Administrator\Desktop\自己测试捕获\contig.mapped.mapped.20000.sam'
#
# inputpath2 = r'C:\Users\Administrator\Desktop\自己测试捕获\NL230413-1_S113.inter.xls'
# # outputpath1 = r'C:\Users\Administrator\Desktop\自己测试捕获\contig.mapped.mapped.20000.inter.xls'
#
# outputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\genus.count.xls'
# outputpath2=r'C:\Users\Administrator\Desktop\自己测试捕获\species.count.xls'
df2 = pd.read_csv(inputpath2, sep='\t')
# dict_contig_to_genus = dict(zip(df2['contig_name'], df2['genus']))
# dict_contig_to_species = dict(zip(df2['contig_name'], df2['species']))
# print(dict_contig_to_genus)
dict_contig_to_species = dict(zip(df2['accession'], df2['species']))
df1=pd.read_csv(inputpath1,sep='\t',quoting=3,usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12],header=None)
df1.columns=['0','1','2','3','4','5','6','7','8','9','10','11','12']
# print(df1)
df1=df1.drop_duplicates(subset=['0'],keep='first').reset_index(drop=True)
df1['2']=df1['2'].apply(lambda x:x.split(' ')[0])
df1['6']=df1['6'].apply(lambda x:x.split(' ')[0])
# list2=[item for item in dict_contig_to_genus]
# df1=df1[df1['2'].isin(list2)]
# list2.append('=')
# df1=df1[df1['6'].isin(list2)]
#
# print(df1['6'].tolist())
#
## part1
df_part1=df1[df1['2']==df1['6']]#.reset_index(drop=True)
# df_part1['genus']=df_part1['2'].apply(lambda x:dict_contig_to_genus[x.split(' ')[0]])
df_part1['species']=df_part1['2'].apply(lambda x:dict_contig_to_species[x.split(' ')[0]])
# df_part1=df_part1[['genus','species']]
print(df_part1)
#
## part2
df_part2=df1[df1['6']=='=']#.reset_index(drop=True)
# df_part2['genus']=df_part2['2'].apply(lambda x:dict_contig_to_genus[x.split(' ')[0]])
df_part2['species']=df_part2['2'].apply(lambda x:dict_contig_to_species[x.split(' ')[0]])
# df_part2=df_part2[['genus','species']]
print(df_part2)
#
#
## part3
df_part12=pd.concat([df_part1,df_part2])
# print(df_part12)
#
df_part3=df1[~df1['0'].isin(df_part12['0'])].reset_index(drop=True)
#
# # list2=[item+' ' for item in dict_contig_to_genus]
# df_part3=df_part3[df_part3['6'].isin(list2)]
#
#
# df_part3['genus1']=df_part3['2'].apply(lambda x:dict_contig_to_genus[x.split(' ')[0]])
# df_part3['genus2']=df_part3['6'].apply(lambda x:dict_contig_to_genus[x.split(' ')[0]])
df_part3['species1']=df_part3['2'].apply(lambda x:dict_contig_to_species[x.split(' ')[0]])
df_part3['species2']=df_part3['6'].apply(lambda x:dict_contig_to_species[x.split(' ')[0]])
print(df_part3)
# df_part3_1=df_part3[df_part3['genus1']==df_part3['genus2']]
# df_part3_1['genus']=df_part3_1['genus1']
#
df_part3_2=df_part3[df_part3['species1']==df_part3['species2']]
df_part3_2['species']=df_part3_2['species1']
# df_part3_2=df_part3_2[['genus','species']]
#
#
df_part3_3=df_part3[df_part3['species1']!=df_part3['species2']]
df_part3_3['species']=''
df_part1=df_part1[['species']]
df_part2=df_part2[['species']]
df_part3_2=df_part3_2[['species']]
df_part3_3=df_part3_3[['species']]
#
df_last=pd.concat([df_part1,df_part2,df_part3_2,df_part3_3])
print(df_last)
df2=df_last.copy(deep=True)
## 统计属
# value1=pd.value_counts(df2['genus'])
# print(value1.index)
# dict_value1=dict(zip(value1.index,value1.values))
# print(dict_value1)
#
# df_value1=pd.DataFrame.from_dict(dict_value1,orient='index',columns=['count'])
# df_value1=df_value1.rename_axis('genus').reset_index()
# print(df_value1)
## 统计种
value2=pd.value_counts(df2['species'])
print(value2.index)
dict_value2=dict(zip(value2.index,value2.values))
print(dict_value2)
df_value2=pd.DataFrame.from_dict(dict_value2,orient='index',columns=['count'])
df_value2=df_value2.rename_axis('species').reset_index()
print(df_value2)
# df_value1.to_csv(outputpath1,index=None,sep='\t')
df_value2.to_csv(outputpath2,index=None,sep='\t')
if __name__=='__main__':
count_genus_species()
20230509_blast.10_genome.count.revise
##
import pandas as pd
import argparse
parser=argparse.ArgumentParser(description='input error !')
parser.add_argument('-ip1','--inputpath1',required=True,help='input like NL230413-1_S113.blastn.xls')
parser.add_argument('-r','--reference',required=True,help='taxmap_slv_ssu_ref_nr_138.1.xls')
parser.add_argument('-op1','--outputpath1',required=True,help='NL230413-1_S113.blastn.result.xls')
# parser.add_argument('-op2','--outputpath2',required=True,help='NL230413-1_S113.blastn.result.genus.xls')
parser.add_argument('-op3','--outputpath3',required=True,help='NL230413-1_S113.blastn.result.species.xls')
# inputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\NL230413-1_S113.blastn.xls'
# reference=r'C:\Users\Administrator\Desktop\自己测试捕获\taxmap_slv_ssu_ref_nr_138.1.xls'
# outputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\NL230413-1_S113.blastn.result.xls'
args = parser.parse_args()
inputpath1=args.inputpath1
reference=args.reference
outputpath1=args.outputpath1
# outputpath2=args.outputpath2
outputpath3=args.outputpath3
# df1=pd.read_csv(inputpath1,sep='\t',header=None)
df_ref=pd.read_csv(reference,sep='\t')
# dict_accession_to_genus = dict(zip(df_ref['primaryAccession'], df_ref['genus']))
dict_accession_to_species = dict(zip(df_ref['accession'], df_ref['species']))
# with open(inputpath1,'r') as f2,open(outputpath1,'w') as f3:
# for line in f2:
# line=line.strip('\n')
# list_in=line.split('\t')
# f3.write(line+'\t'+dict_accession_to_species[list_in[1]]+'\n')
# list_need=[]
dict_need={}
dict_need1={}
with open(inputpath1,'r') as f2,open(outputpath1,'w') as f3:
for line in f2:
line=line.strip('\n')
list_in=line.split('\t')
list_in.append(dict_accession_to_species[list_in[1]])
if list_in[0] in dict_need:
dict_need[list_in[0]].append(list_in[-1])
else:
dict_need[list_in[0]]=[list_in[-1]]
# list1=[1,23,4,1,1,4,45,4,4,6]
# print(max(list1,key=list1.count))
for i in dict_need:
# dict_need1[i]=max(dict_need[i],key=dict_need[i].count)
dict_need1[i] = dict_need[i][0]
species_value_counts = pd.value_counts(list(dict_need1.values()))
dict_species_value_counts = dict(zip(species_value_counts.index, species_value_counts.values))
print(dict_species_value_counts)
df_species_value_counts = pd.DataFrame.from_dict(dict_species_value_counts, orient='index', columns=['count'])
df_species_value_counts = df_species_value_counts.rename_axis('species').reset_index()
print(df_species_value_counts)
df_species_value_counts.to_csv(outputpath3,sep='\t',index=None)
# f3.write(line+'\t'+dict_accession_to_species[list_in[1]]+'\n')
#
#
#
#
#
# # df1['genus']=df1[1].apply(lambda x:dict_accession_to_genus[x])
# # df1['species']=df1[1].apply(lambda x:dict_accession_to_species[x])
# df1=pd.read_csv(outputpath1,sep='\t',header=None)
# print(df1)
#
# list_contig=list(set(df1[0].tolist()))
# list_contig.sort(key=list(set(df1[0].tolist())).index)
# print(len(list_contig))
#
# # dict_contig_to_genus={}
# dict_contig_to_species={}
# for item in list_contig:
# df_part=df1[df1[0]==item].copy()
# ## 种
# species_value_counts=pd.value_counts(df_part[12].tolist())
# dict_species_value_counts=dict(zip(species_value_counts.index,species_value_counts.values))
# print(dict_species_value_counts)
# df_species_value_counts=pd.DataFrame.from_dict(dict_species_value_counts,orient='index',columns=['count'])
# df_species_value_counts=df_species_value_counts.rename_axis('species').reset_index()
# print(df_species_value_counts)
# dict_contig_to_species[item]=df_species_value_counts.loc[0,'species']
#
# # print(dict_contig_to_genus)
# print(dict_contig_to_species)
#
# with open(outputpath1,'w') as f1:
# f1.write('contig_name\tspecies\n')
# for i in dict_contig_to_species:
# f1.write(i+'\t'+dict_contig_to_species[i]+'\n')
#
# df_last=pd.read_csv(outputpath1,sep='\t')
#
#
# ##种
# species_value_counts = pd.value_counts(df_last['species'].tolist())
# dict_species_value_counts = dict(zip(species_value_counts.index, species_value_counts.values))
# print(dict_species_value_counts)
# df_species_value_counts = pd.DataFrame.from_dict(dict_species_value_counts, orient='index', columns=['count'])
# df_species_value_counts = df_species_value_counts.rename_axis('species').reset_index()
# print(df_species_value_counts)
# df_species_value_counts.to_csv(outputpath3,sep='\t',index=None)
20230508_匹配基因组的名称和染色体id
import os
inputpath1=r'/data/lijing/assemble_capture_test/species_whole_genome/ZymoBIOMICS.STD.refseq.v2/Genomes'
outputpath1=r'/data/lijing/assemble_capture_test/species_whole_genome/10_species.accession.tax.xls'
list1=os.listdir(inputpath1)
with open(outputpath1,'w') as f2:
f2.write('accession\tspecies\n')
for i in list1:
with open(inputpath1+'/'+i,'r') as f1:
for line in f1:
line=line.strip('\n')
if line.startswith('>'):
f2.write('{}'.format(line.split(' ')[0][1:]+'\t'+i.split('_')[0]+' '+i.split('_')[1]+'\n'))
20230509_拆分tax分为genus和species
import pandas as pd
df1=pd.read_csv(r'D:\pycharm\project\20230506_使用\taxonomy.tsv',sep='\t')
print(df1)
df1.columns=['primaryAccession','Taxon']
df1['genus']=df1['Taxon'].apply(lambda x:x.split('g__')[-1].split(';')[0])
df1['species']=df1['Taxon'].apply(lambda x:x.split('s__')[-1].replace('_',' '))
print(df1)
df1.to_csv(r'D:\pycharm\project\20230506_使用\taxonomy.change.tsv',index=None,sep='\t')
# df1=pd.read_csv(r'D:\pycharm\project\20230506_使用\taxonomy.tsv',sep='\t')
# print(df1)
# df1.columns=['accession','Taxon']
# df1['genus']=df1['Taxon'].apply(lambda x:x.split('g__')[-1].split(';')[0])
# df1['species']=df1['Taxon'].apply(lambda x:x.split('s__')[-1].replace('_',' '))
# print(df1)