1.代码1
import pandas as pd
# df5.to_csv('%s/tmp_blastn_match_taxid_lineage_add_kgs' % outputpath,index=False, sep='\t')
def drop_mixed_genus_queries(df):
    """Return *df* without the queries whose hits span more than one genus.

    The genus of a hit is the first space-separated word of its ``species``
    value after ``str()`` conversion (so a missing value is compared by its
    string form, e.g. ``"nan"``).  A ``Query id`` is dropped, together with
    all of its rows, when its hits do not all share that first word.

    Args:
        df: Table with at least the columns ``Query id`` and ``species``.

    Returns:
        A new DataFrame holding only the rows of single-genus queries, with
        a fresh 0..n-1 index.  ``df`` itself is not modified.
    """
    genus = pd.Series(
        [str(name).split(' ', 1)[0] for name in df['species'].tolist()],
        dtype=object,
    )
    # Group on the raw id values (not on their str() form): the previous
    # version filtered with isin([str(id)]), which silently removed nothing
    # whenever the id column was parsed as a numeric dtype.
    # sort=False: the group order is irrelevant here and mixed-type ids
    # need not be sortable.
    genus_per_query = genus.groupby(
        df['Query id'].to_numpy(), sort=False
    ).nunique()
    mixed_ids = genus_per_query[genus_per_query > 1].index
    return df[~df['Query id'].isin(mixed_ids)].reset_index(drop=True)


if __name__ == "__main__":
    # NOTE: the original notes also carried a disabled check that every
    # '% identity' value exceeds 80; it was commented out and is not run.
    blast_hits = pd.read_csv(
        r'2021-10\K200005137_L01_126_nt_virus_blastn_match_taxid_lineage_add_kgs.txt',
        sep='\t',
    )
    print(blast_hits)

    # Query id -> species; for a repeated id the last row wins.
    query_to_species = (
        blast_hits[["Query id", "species"]]
        .set_index("Query id")
        .to_dict()["species"]
    )
    # The original built this mapping with two equivalent to_dict() calls
    # and printed each; both prints are kept so stdout is unchanged.
    print(query_to_species)
    print(query_to_species)

    single_genus_hits = drop_mixed_genus_queries(blast_hits)
    # `outputpath` is not assigned anywhere in this snippet; it has to be
    # defined by the surrounding pipeline before this line runs.
    single_genus_hits.to_csv(
        '%s/tmp_blastn_match_taxid_lineage_add_kgs_delrepeat' % outputpath,
        index=False,
        sep='\t',
    )
2.代码2
# import pandas as pd
# df5=pd.read_csv(r'2021-10\K200005137_L01_126_nt_virus_blastn_match_taxid_lineage_add_kgs.txt',sep='\t')
# df1=df5
# Positions of the two fields used below within one tab-separated record.
KEY_COLUMN = 2       # named `head` in the original; presumably the query id -- confirm
SPECIES_COLUMN = -1  # last field of the record


def count_species_per_key(lines):
    """Count, for every key, how often each species value occurs.

    Each line is stripped of its newline and split on tabs; the key is the
    field at ``KEY_COLUMN`` and the species is the last field.  Lines with
    too few fields to contain a key (e.g. a blank trailing line) are skipped
    instead of raising ``IndexError``.

    Args:
        lines: Iterable of text lines (data rows only, no header).

    Returns:
        ``{key: {species: count}}`` in first-seen order.
    """
    counts = {}
    for raw in lines:
        fields = raw.strip('\n').split('\t')
        if len(fields) <= KEY_COLUMN:
            continue
        per_species = counts.setdefault(fields[KEY_COLUMN], {})
        species = fields[SPECIES_COLUMN]
        per_species[species] = per_species.get(species, 0) + 1
    return counts


def lines_without_keys(lines, rejected_keys):
    """Yield the lines whose key field is not in *rejected_keys*.

    The comparison is an exact match on the ``KEY_COLUMN`` field.  The
    previous version tested ``key in line`` as a substring, so rejecting
    ``read1`` also removed ``read10`` rows and any row mentioning the key in
    another column.  Lines too short to have a key are passed through
    unchanged.

    Args:
        lines: Iterable of text lines; each kept line is yielded verbatim.
        rejected_keys: Container of keys to drop; a dict or set gives O(1)
            membership tests.
    """
    for raw in lines:
        fields = raw.strip('\n').split('\t')
        if len(fields) > KEY_COLUMN and fields[KEY_COLUMN] in rejected_keys:
            continue
        yield raw


if __name__ == "__main__":
    source_path = r'2021-10\K200005137_L01_126_nt_virus_blastn_match_taxid_lineage_add_kgs.txt'
    target_path = r'2021-10\K200005137_L01_126_nt_virus_blastn_match_taxid_lineage_add_kgs_deldelrept.txt'

    with open(source_path) as f:
        f.readline()  # skip the header row
        species_counts = count_species_per_key(f)
    print(list(species_counts))

    # Keep only the keys that were seen with more than one distinct species;
    # these are the ones whose rows get removed from the output.
    ambiguous = {
        key: per_species
        for key, per_species in species_counts.items()
        if len(per_species) != 1
    }
    print(ambiguous)
    print(list(ambiguous))

    # Second pass over the same file: copy every line (header included)
    # except the rows that belong to an ambiguous key.
    with open(source_path) as oldfile, open(target_path, 'w') as newfile:
        newfile.writelines(lines_without_keys(oldfile, ambiguous))
# lines_seen=set()
# with open(r'2021-10\K200005137_L01_126_nt_virus_blastn_match_taxid_lineage_add_kgs_deldelrept.txt') as f,open(r'2021-10\K200005137_L01_126_nt_virus_blastn_match_taxid_lineage_add_kgs_deldelrept1.txt', 'a+') as outfile:
# for line in f:
# if line not in lines_seen:
# outfile.write(line)
# lines_seen.add(line)