# 1. Compare species-level differences between bowtie2- and bmtagger-based host removal
import pandas as pd
import random
# Compare species-level calls between a bmtagger-filtered sample (inputpath1) and a
# bowtie2-filtered sample (inputpath2): keep species with >= 2 supporting reads,
# compute the bowtie2-only set and the intersection, then pull the kraken2 read-level
# records for two fixed 10-taxid panels, annotated with the English species name.
inputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\NL230601-1B_S45_L002.2.classify.last.xls'
inputpath2=r'C:\Users\Administrator\Desktop\自己测试捕获\20230619\第三张图-new20230629\宏基因组\22-12373F.2.classify.last.xls'

df1 = pd.read_csv(inputpath1, sep='\t')
df2 = pd.read_csv(inputpath2, sep='\t')
print(df1)


def _species_info(df):
    """Return (taxid list, taxid -> Ename dict) for species rows with count >= 2."""
    part = df[(df['count'] >= 2) & (df['G/S'] == 'species')].copy().reset_index(drop=True)
    return part['taxid'].tolist(), dict(zip(part['taxid'], part['Ename']))


list1_genus, dict1 = _species_info(df1)
list2_genus, dict2 = _species_info(df2)

# Taxids called only in sample 2 (bowtie2) and taxids called in both samples.
list_1_to_2 = list(set(list2_genus).difference(set(list1_genus)))
print(list_1_to_2)
list_intersection = list(set(list2_genus).intersection(set(list1_genus)))
print(list_intersection)

# Fixed 10-taxid panels, originally drawn once with random.sample(...) and then
# hard-coded so the analysis stays reproducible across reruns.
list_intersection_10 = [74426, 28116, 2528203, 48074, 2025876, 208479, 821, 29442, 2492837, 357276]
list_1_to_2_10 = [1784, 1423, 2023057, 83656, 2005458, 1357916, 247, 1969841, 562, 2584122]

inputpath3=r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\bmtagger.kraken2'
inputpath4=r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\bowtie2.kraken2'


def _collect_reads(df, taxids, name_map):
    """Concatenate the kraken2 rows (column 2 == taxid) for each panel taxid,
    with an extra 'Ename' column mapped from the classify table."""
    parts = [pd.DataFrame(columns=[0])]  # seed frame mirrors the original accumulator
    for taxid in taxids:
        part = df[df[2] == taxid].copy().reset_index(drop=True)
        part['Ename'] = part[2].apply(lambda x: name_map[x])
        parts.append(part)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(parts)


df3 = pd.read_csv(inputpath3, sep='\t', header=None)
df_add = _collect_reads(df3, list_intersection_10, dict1)
print(df_add)
df_add.to_csv(r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\intersection_10.kraken2.xls',sep='\t',index=None,header=None)

df4 = pd.read_csv(inputpath4, sep='\t', header=None)
df_add = _collect_reads(df4, list_1_to_2_10, dict2)
print(df_add)
df_add.to_csv(r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\1_to_2_10.kraken2.xls',sep='\t',index=None,header=None)
# 2. extract_seq — pull the R1/R2 sequences of each kraken2-classified read into FASTA
## [83561, 74426, 28116, 214856, 2528203, 48074, 1307, 2025876, 208479, 821]
## [1969841, 1784, 277, 1423, 585, 63363, 646637, 53462, 732, 1580]
import pyfastx
import pandas as pd
# Extract the paired-end sequences of each kraken2-classified read and write them as
# FASTA (R1 + R2 per read, header '>name_taxid:Species_name:r1|r2') for BLAST checks.
inputpath1=r'/data/lijing/Cust_NAD_WGS/NL230601-1B_S45_L002_R1_001.fastq.gz'
inputpath2=r'/data/lijing/Cust_NAD_WGS/NL230601-1B_S45_L002_R2_001.fastq.gz'
inputpath3=r'/data/lijing/20230629_test_Cust_NAD_WGS/intersection_10.kraken2.xls'
inputpath4=r'/data/lijing/20230629_test_Cust_NAD_WGS/1_to_2_10.kraken2.xls'


def _index_fastq(path):
    """Map read name -> [seq, qual, comment] for one FASTQ file."""
    return {name: [seq, qual, comment]
            for name, seq, qual, comment in pyfastx.Fastx(path, comment=True)}


dict1 = _index_fastq(inputpath1)  # R1 reads
dict2 = _index_fastq(inputpath2)  # R2 reads


def _write_pairs(table_path, fasta_path):
    """For each kraken2 row write the read's R1/R2 sequences as one FASTA pair.

    kraken2 columns used: 1 = read name, 2 = assigned taxid, 5 = species name.
    """
    df = pd.read_csv(table_path, sep='\t', header=None)
    with open(fasta_path, 'w') as out:
        for i in range(len(df)):
            name = df.loc[i, 1]
            prefix = '>' + name + '_' + str(int(df.loc[i, 2])) + ':' + str(df.loc[i, 5]).replace(' ', '_')
            out.write(prefix + ':r1' + '\n' + dict1[name][0] + '\n' +
                      prefix + ':r2' + '\n' + dict2[name][0] + '\n')


_write_pairs(inputpath3, r'/data/lijing/20230629_test_Cust_NAD_WGS/intersection_10.kraken2.fa')
# BUG FIX: the original second loop indexed read names from df3 while iterating df4's
# rows, pairing sequences and labels from unrelated reads; both now come from df4.
_write_pairs(inputpath4, r'/data/lijing/20230629_test_Cust_NAD_WGS/1_to_2_10.kraken2.fa')
# 3. For online-BLAST results: split the streamed JSON into per-query reports and flatten to TSV
import json
# The online-BLAST download is one big JSON object whose "BlastOutput2" list holds one
# report per query. It is parsed by streaming lines: report bodies accumulate in a
# buffer, a line starting with ',' separates sibling reports, and a line starting
# with ']' closes the list. Every HSP of every hit becomes one TSV row.

def _write_search_hits(out, report):
    """Write one tab-separated line per HSP of one BLAST 'search' result."""
    search = report['report']['results']['search']
    for hit in search['hits']:
        desc = hit['description'][0]
        for hsp in hit['hsps']:
            out.write('\t'.join([
                search['query_title'],
                str(search['query_len']),
                desc['accession'],
                desc['title'],
                str(desc['taxid']),
                desc['sciname'],
                str(hsp['identity']),
                str(hsp['query_from']),
                str(hsp['query_to']),
                str(hsp['hit_from']),
                str(hsp['hit_to']),
                str(hsp['align_len']),
                str(hsp['gaps']),
                hsp['qseq'],
                hsp['hseq'],
                hsp['midline'],
            ]) + '\n')


with open(r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\1_to_2_10.json','r') as f1,open(r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\1_to_2_10.xls','w') as f2:
    f2.write('query_title\tquery_len\taccession\ttitle\ttaxid\tsciname\tidentity\tquery_from\tquery_to\thit_from\thit_to\talign_len\tgaps\tqseq\thseq\tmidline\n')
    # Skip the two wrapper lines of the outer object ('{' and '"BlastOutput2": [').
    f1.readline()
    f1.readline()
    line_part = ''
    for line in f1:
        line = line.strip('\n')
        if line.startswith(','):
            # Separator between reports: the buffer now holds one complete report.
            _write_search_hits(f2, json.loads(line_part))
            line_part = ''
        elif line.startswith(']'):
            # End of the report list: patch the buffer into a closed JSON object
            # (drop the dangling last character, re-append '}') and flush it.
            line_part = line_part[:-1] + '}'
            dict_part = json.loads(line_part)
            print(type(dict_part))
            _write_search_hits(f2, dict_part)
        else:
            line_part += line.strip()
# 4. From the online-BLAST table, estimate the false-positive fraction (human or other sources) among the 10 species found by bowtie2 but not by bmtagger
import pandas as pd
# For each BLAST query, keep its first two HSP rows and print the query only when
# both rows hit the taxid that kraken2 originally assigned (i.e. a confirmed call).
inputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\1_to_2_10.xls'

df1 = pd.read_csv(inputpath1, sep='\t')
# query_title encodes '<read>_<taxid>:<Species_name>:<r1|r2>'.
df1['query_title_id'] = df1['query_title'].apply(lambda title: title.split('_')[0])
df1['query_title_taxid'] = df1['query_title'].apply(lambda title: int(title.split('_')[1].split(':')[0]))
print(df1)

list1 = list(set(df1['query_title'].tolist()))
print(list1)

for title in list1:
    top_two = df1[df1['query_title'] == title].copy().iloc[0:2, :].reset_index(drop=True)
    matching = top_two[top_two['query_title_taxid'] == top_two['taxid']]
    if len(matching) == 2:
        print(matching)
# 5. blast_alignment — extract the local blastn JSON result into a flat TSV
import json
# dict_part={}
# Flatten a local blastn JSON result (same streamed multi-report layout as the online
# download, but the hit descriptions carry no taxid/sciname) into one TSV row per HSP.

def _flatten_blastn_hits(out, report):
    """Write one tab-separated line per HSP of one BLAST 'search' result."""
    search = report['report']['results']['search']
    for hit in search['hits']:
        desc = hit['description'][0]
        for hsp in hit['hsps']:
            out.write('\t'.join([
                search['query_title'],
                str(search['query_len']),
                desc['accession'],
                desc['title'],
                str(hsp['identity']),
                str(hsp['query_from']),
                str(hsp['query_to']),
                str(hsp['hit_from']),
                str(hsp['hit_to']),
                str(hsp['align_len']),
                str(hsp['gaps']),
                hsp['qseq'],
                hsp['hseq'],
                hsp['midline'],
            ]) + '\n')


with open(r'intersection_10.kraken2.blastn.json', 'r') as f1, open(
        r'intersection_10.kraken2.blastn.xls', 'w') as f2:
    f2.write(
        'query_title\tquery_len\taccession\ttitle\tidentity\tquery_from\tquery_to\thit_from\thit_to\talign_len\tgaps\tqseq\thseq\tmidline\n')
    # Skip the two wrapper lines of the outer JSON object.
    f1.readline()
    f1.readline()
    line_part = ''
    for line in f1:
        line = line.strip('\n')
        if line.startswith(','):
            # Separator between reports: the buffer now holds one complete report.
            _flatten_blastn_hits(f2, json.loads(line_part))
            line_part = ''
        elif line.startswith(']'):
            # End of the report list: patch the buffer into a closed object and flush.
            line_part = line_part[:-1] + '}'
            dict_part = json.loads(line_part)
            print(type(dict_part))
            _flatten_blastn_hits(f2, dict_part)
        else:
            line_part += line.strip()
# 6. determine_accurate — per species, count raw read pairs vs BLAST-confirmed read pairs
import pandas as pd
# Summarize BLAST verification per species: 'raw_reads' = read pairs extracted for the
# species (from the FASTA headers), 'real_reads' = pairs where BOTH mates (r1 and r2)
# matched the expected species in their top BLAST hit.
inputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\1_to_2_10.kraken2.blastn.xls'
inputpath2=r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\1_to_2_10.kraken2.fa'
outputpath1=r'C:\Users\Administrator\Desktop\自己测试捕获\20230704_比较bowtie2和bwtagger\1_to_2_10.kraken2.blastn.last.top1.xls'


def _ename_from_title(title):
    """'<read>_<taxid>:<Species_name>:<r1|r2>' -> 'Species name' (underscores -> spaces)."""
    return title.split('_', 1)[1].split(':')[1].replace('_', ' ')


def _counts_frame(values, key_name, count_name):
    """Value counts of an iterable as a two-column DataFrame [key_name, count_name].

    Uses Series.value_counts: the module-level pd.value_counts is deprecated and
    removed in pandas 3.0.
    """
    counts = pd.Series(list(values)).value_counts()
    frame = pd.DataFrame.from_dict(dict(zip(counts.index, counts.values)),
                                   orient='index', columns=[count_name])
    return frame.rename_axis(key_name).reset_index()


# Raw read-pair counts per species and species -> taxid, from the FASTA headers.
list_all = []
dict_all = {}
with open(inputpath2, 'r') as f3:
    for line in f3:
        line = line.strip('\n')
        if line.startswith('>'):
            # The leading '>' sits before the first '_' so the same parsing applies.
            list_all.append(_ename_from_title(line))
            dict_all[_ename_from_title(line)] = line.split('_', 1)[1].split(':')[0]
print(dict_all)
df_value_count_all = _counts_frame(list_all, 'Ename', 'raw_reads')

# Annotate each BLAST hit row with whether it matches the expected species.
with open(inputpath1, 'r') as f1, open(outputpath1, 'w') as f2:
    f1.readline()
    f2.write('query_title\tdescription\tequal\n')
    for line in f1:
        line = line.strip('\n')
        list_line = line.split('\t')
        expected = list_line[0].split(':')[-2].replace('_', ' ')
        # '[Clostridium] bolteae' was renamed 'Enterocloster bolteae' in NCBI taxonomy,
        # so either spelling counts as a match for that species.
        if expected in list_line[3] or \
                (expected == '[Clostridium] bolteae' and 'Enterocloster bolteae' in list_line[3]):
            f2.write(list_line[0] + '\t' + list_line[3] + '\t' + 'yes' + '\n')
        else:
            f2.write(list_line[0] + '\t' + list_line[3] + '\t' + 'no' + '\n')

df2 = pd.read_csv(outputpath1, sep='\t')
df2['Ename'] = df2['query_title'].apply(_ename_from_title)
df2 = df2[df2['equal'] == 'yes'].copy().reset_index(drop=True)
print(df2)
# Keep at most the top 2 confirmed rows per query (one per mate).
df2 = df2.groupby(["query_title"]).head(2).reset_index(drop=True)
print(df2)
print("**************" * 5)

df_value_count_have = _counts_frame(df2['query_title'], 'query_title', 'counts')
print(dict(zip(df_value_count_have['query_title'], df_value_count_have['counts'])))
# A read pair is confirmed only when both mates matched (count == 2).
df_value_count_have = df_value_count_have[df_value_count_have['counts'] == 2]
print(df_value_count_have)
df_value_count_have['Ename'] = df_value_count_have['query_title'].apply(_ename_from_title)
df_value_count_have = _counts_frame(df_value_count_have['Ename'], 'Ename', 'real_reads')
print(df_value_count_have)

# Outer merge so species with zero confirmed pairs still appear (real_reads -> 0).
df_merge = pd.merge(df_value_count_all, df_value_count_have, how='outer', on=['Ename'])
df_merge.fillna(0, inplace=True)
print(df_merge)
df_merge['taxid'] = df_merge['Ename'].apply(lambda x: dict_all[x])
df_merge = df_merge[['Ename', 'taxid', 'raw_reads', 'real_reads']]
df_merge.to_csv(outputpath1, sep='\t', index=None, header=True)
# ## 第二次写
# list_have=[]
# with open(outputpath1,'r') as f3,open(outputpath2,'w') as f4:
# f3.readline()
# f4.write('query_title\tdescription\tequal\n')
# for line in f3:
# line = line.strip('\n')
# list_line = line.split('\t')
#
# if list_have.count(list_line[0]) >= 2:
# pass
# else:
# f4.write(line+'\n')
# list_have.append(list_line[0])
#
# df2=pd.read_csv(outputpath2,sep='\t')
# df2['Ename']=df2['query_title'].apply(lambda x:x.split('_', 1)[1].split(':')[1].replace('_',' '))
# df2=df2[df2['equal']=='yes'].copy().reset_index(drop=True)
# print(df2)
#
# value_counts_have=pd.value_counts(df2['query_title'])
# dict_value_count_have=dict(zip(value_counts_have.index,value_counts_have.values))
# print(dict_value_count_have)
# df_value_count_have=pd.DataFrame.from_dict(dict_value_count_have,orient='index',columns=['counts'])
# df_value_count_have=df_value_count_have.rename_axis('query_title').reset_index()
#
# df_value_count_have=df_value_count_have[df_value_count_have['counts']==2]
# print(df_value_count_have)
#
#
# list_have1=list(set(df_value_count_have['query_title'].tolist()))
# df_value_count_have['Ename']=df_value_count_have['query_title'].apply(lambda x:x.split('_', 1)[1].split(':')[1].replace('_',' '))
# # df2=df2[df2['query_title'].isin(list_have1)]
# value_counts_have=pd.value_counts(df_value_count_have['Ename'])
# dict_value_count_have=dict(zip(value_counts_have.index,value_counts_have.values))
# print(dict_value_count_have)
# df_value_count_have=pd.DataFrame.from_dict(dict_value_count_have,orient='index',columns=['real_reads'])
# df_value_count_have=df_value_count_have.rename_axis('Ename').reset_index()
# print(df_value_count_have)
#
# df_merge=pd.merge(df_value_count_all,df_value_count_have,how='outer',on=['Ename'])
#
# df_merge.fillna(0,inplace=True)
# print(df_merge)
# df_merge['taxid']=df_merge['Ename'].apply(lambda x:dict_all[x])
# df_merge=df_merge[['Ename','taxid','raw_reads','real_reads']]
# df_merge.to_csv(outputpath1,sep='\t',index=None,header=True)
## 第一次写
# df1=pd.read_csv(outputpath1,sep='\t')
#
# list1=list(set(df1['query_title'].tolist()))
# print(list1)
#
# list_have=[]
# for i in list1:
# df1_part=df1[df1['query_title']==i].copy().iloc[0:2,:]
# df1_part=df1_part[df1_part['equal']=='yes'].reset_index(drop=True)
#
# if len(df1_part)==2:
# # print(df1_part)
# # list_have.append(df1_part.loc[0,'query_title'].split('_',1)[1].rsplit(':',1)[0])
# list_have.append(df1_part.loc[0, 'query_title'].split('_', 1)[1].split(':')[1].replace('_',' '))
# print(list_have)
#
# value_counts_have=pd.value_counts(list_have)
# dict_value_count_have=dict(zip(value_counts_have.index,value_counts_have.values))
# print(dict_value_count_have)
# df_value_count_have=pd.DataFrame.from_dict(dict_value_count_have,orient='index',columns=['real_reads'])
# df_value_count_have=df_value_count_have.rename_axis('Ename').reset_index()
# print(df_value_count_have)
#
# df_merge=pd.merge(df_value_count_all,df_value_count_have,how='outer',on=['Ename'])
#
# df_merge.fillna(0,inplace=True)
# print(df_merge)
# df_merge.to_csv(outputpath1,sep='\t',index=None,header=True)
# 7. extract_seq_base_human_id — drop reads whose ids appear in the human-id list, then rewrite both FASTQ mates
import pyfastx
import gzip
import argparse
# Remove human reads from a FASTQ pair: every read name listed (one per line) in
# inputpath3 is dropped from both mates; the remainder is rewritten as gzip FASTQ.
parser=argparse.ArgumentParser(description='input error !')
parser.add_argument('-ip1','--inputpath1',required=True,help='input like NL230601-1B_S45_L002.r1.paired.fq.gz')
parser.add_argument('-ip2','--inputpath2',required=True,help='input like NL230601-1B_S45_L002.r2.paired.fq.gz')
parser.add_argument('-ip3','--inputpath3',required=True,help='input like NL230601-1B_S45_L002.out')
parser.add_argument('-op1','--outputpath1',required=True,help='NL230601-1B_S45_L002.human.fq.1.gz')
parser.add_argument('-op2','--outputpath2',required=True,help='NL230601-1B_S45_L002.human.fq.2.gz')
args = parser.parse_args()
inputpath1 = args.inputpath1
inputpath2 = args.inputpath2
inputpath3 = args.inputpath3
outputpath1 = args.outputpath1
outputpath2 = args.outputpath2


def _index_fastq(path):
    """Map read name -> [seq, qual, comment] for one FASTQ file."""
    return {name: [seq, qual, comment]
            for name, seq, qual, comment in pyfastx.Fastx(path, comment=True)}


dict1 = _index_fastq(inputpath1)  # R1 reads
dict2 = _index_fastq(inputpath2)  # R2 reads

# Human read ids, one per line (the 'with' block closes the file; no explicit close).
with open(inputpath3, 'r') as f1:
    list_human_id = [line.strip('\n') for line in f1]

for read_id in list_human_id:
    # pop(..., None) tolerates ids missing from a mate file; the original bare
    # 'del' raised KeyError and aborted the whole run in that case.
    dict1.pop(read_id, None)
    dict2.pop(read_id, None)
dict_remove_human1 = dict1
dict_remove_human2 = dict2


def _write_fastq_gz(path, records):
    """Write records (read name -> [seq, qual, comment]) as a gzip-compressed FASTQ."""
    with gzip.open(path, 'wb') as out:
        for name, (seq, qual, comment) in records.items():
            record = '@' + name + ' ' + comment + '\n' + seq + '\n+\n' + qual + '\n'
            out.write(record.encode(encoding='utf-8'))


_write_fastq_gz(outputpath1, dict_remove_human1)
_write_fastq_gz(outputpath2, dict_remove_human2)