这本质上还是个文件合并的问题
这是sample文件
这是database文件
要求是把database文件的第四列追加到sample文件后面。限制条件:
- 区间要有重合。也就是说start和end这两列的数字表示每个样本的区间
- 变异类型要一致,也就是type这一列要一样。
最后做出来要像这样:
- 最后追加一列
- 有符合限制条件的就追加上去,逗号隔开,没有就空着
# 循环遍历的判定方法
import getpass,os
# Switch to the current user's Desktop, where the input files are read from
# and the result file is written to. The home directory is resolved with
# os.path.expanduser instead of a hard-coded 'c:/Users/<login>' prefix, so the
# script also works when the profile is on another drive or its folder name
# differs from the login name.
usr = getpass.getuser()
os.chdir(os.path.join(os.path.expanduser('~'), 'Desktop'))
def read1(file):
    """Load the sample table into a dict keyed by the full sample row.

    The first line of ``file`` is treated as a header and skipped. Every
    remaining line is stripped and split on tabs; blank lines (for example a
    trailing empty line at the end of the file) are ignored instead of
    raising ``IndexError``.

    Args:
        file: Path of a tab-separated text file whose data rows have at
            least five columns.

    Returns:
        A dict mapping the first five columns, re-joined with tabs, to
        ``[column 5, (column 3, column 4)]`` (1-based column numbers). All
        values are kept as strings. The calling script uses them as the
        variant type and the (start, end) interval.

    Raises:
        IndexError: If a non-blank data row has fewer than five columns.
    """
    dictt = {}
    with open(file, "r") as f:
        next(f, None)  # skip the header line; an empty file yields {}
        for line in f:
            line = line.strip()
            if not line:
                continue
            arr = line.split("\t")
            dictt["\t".join(arr[:5])] = [arr[4], (arr[2], arr[3])]
    return dictt
# Load the sample table (header line skipped) from the current working
# directory; keys are the tab-joined sample rows.
sample1 = read1('Test2_sample')
def read2(file):
    """Load the database table into a dict keyed by its fourth column.

    The first line of ``file`` is treated as a header and skipped. Every
    remaining line is stripped and split on tabs; blank lines (for example a
    trailing empty line at the end of the file) are ignored instead of
    raising ``IndexError``.

    Args:
        file: Path of a tab-separated text file whose data rows have at
            least five columns.

    Returns:
        A dict mapping column 4 to ``[column 5, (column 2, column 3)]``
        (1-based column numbers). All values are kept as strings. The calling
        script uses them as the variant type and the (start, end) interval.
        Rows that share the same column-4 value overwrite each other; the
        last one wins.

    Raises:
        IndexError: If a non-blank data row has fewer than five columns.
    """
    dictt = {}
    with open(file, "r") as f:
        next(f, None)  # skip the header line; an empty file yields {}
        for line in f:
            line = line.strip()
            if not line:
                continue
            arr = line.split("\t")
            dictt[arr[3]] = [arr[4], (arr[1], arr[2])]
    return dictt
# Load the database table (header line skipped) from the current working
# directory; keys are the values of its fourth column.
database = read2('Test2_database')
# Annotate every sample row with the database keys whose type matches and
# whose interval overlaps the sample interval, then write the result table.

# Minimum overlap length for a database entry to count as a match. With the
# default of 1, any overlap of at least one position is enough.
limit = 1

output = {}
for k1, v1 in sample1.items():
    v1_0, v1_1 = int(v1[1][0]), int(v1[1][1])
    matched = []
    for k2, v2 in database.items():
        if v1[0] != v2[0]:  # variant types must be identical
            continue
        v2_0, v2_1 = int(v2[1][0]), int(v2[1][1])
        # Length of the intersection of the half-open intervals
        # [v1_0, v1_1) and [v2_0, v2_1). This equals what counting the shared
        # positions one by one would give, without iterating over every
        # position of the interval.
        in_ = max(0, min(v1_1, v2_1) - max(v1_0, v2_0))
        if in_ >= limit:
            matched.append(k2)
    # Joining a list avoids both the trailing comma and the leading space
    # that the previous ' ' seed left in front of every annotation.
    output[k1] = ','.join(matched)
print(output)

with open('2.txt', 'w') as f:
    f.write('Samples\tChr\tStar\tEnd\tType\tDgv_ann\n')
    for k, v in output.items():
        f.write(k + '\t' + v + '\n')
下面是用numpy来解答,代码量减半!
- 读取功能十分强大,直接把文件做成多维数组
- 读取时和操作时要特别注意数据类型 dtype=
- 对多维数组可以做非常灵活的切片操作
- 我还不知道怎么对数组中所有元素批量操作,比如在所有元素后面都加个‘\t’?
import os
import getpass
# Switch to the current user's Documents folder, where the input files are
# read from and the result file is written to. The home directory is resolved
# with os.path.expanduser instead of a hard-coded 'c:/Users/<login>' prefix,
# so the script also works when the profile is on another drive or its folder
# name differs from the login name.
user = getpass.getuser()
os.chdir(os.path.join(os.path.expanduser('~'), 'Documents'))
import numpy as np
# Load both tables as 2-D arrays of byte strings, skipping the header row.
# ndmin=2 keeps the (rows, columns) layout even when a file holds a single
# data row; without it loadtxt returns a 1-D array and row indexing breaks.
database = np.loadtxt('Test2_database', dtype=bytes, delimiter='\t',
                      skiprows=1, ndmin=2)
# The sample file is split on tabs exactly like the database file, so a field
# containing spaces is not broken into extra columns.
sample = np.loadtxt('Test2_sample', dtype=bytes, delimiter='\t',
                    skiprows=1, ndmin=2)
# An overlap must be strictly longer than this many positions to count.
limit = 0
def combine(r):
    """Collect the database IDs that annotate sample row ``r``.

    Reads the module-level ``sample`` and ``database`` arrays and ``limit``.
    A database row matches when its last column equals the last column of the
    sample row and the half-open intervals ``[sample[-3], sample[-2])`` and
    ``[database[-4], database[-3])`` share more than ``limit`` positions.

    Args:
        r: Row index into ``sample``.

    Returns:
        The second-to-last column of every matching database row, decoded as
        UTF-8 and each followed by a comma (so a non-empty result ends with a
        trailing comma); an empty string when nothing matches.
    """
    s = sample[r]
    s_start, s_end = int(s[-3]), int(s[-2])
    hits = []
    for d in database:
        # Compare against the type column only. The previous membership test
        # (`s[-1] in d`) also matched when any other column happened to equal
        # the type string. Assumes the type is the last database column.
        if s[-1] != d[-1]:
            continue
        # Intersection length computed arithmetically; identical to counting
        # the shared positions one by one, without looping over the interval.
        count = max(0, min(s_end, int(d[-3])) - max(s_start, int(d[-4])))
        if count > limit:
            hits.append(d[-2].decode('UTF-8'))
    return ''.join(name + ',' for name in hits)
#print(combine(1))
# Build one tab-joined output line per sample row, append its annotation and
# write the table. The builtin ``str`` replaces the ``np.str`` alias, which
# was removed in NumPy 1.24 and now raises AttributeError.
sp = sample.astype(str).tolist()
output = []
for i, row in enumerate(sp):
    # combine() terminates every ID with a comma; drop the final one.
    output.append('\t'.join(row) + '\t' + combine(i)[:-1])
with open('2.txt', 'w') as f:
    f.write('Samples\tChr\tStar\tEnd\tType\tDgv_ann\n')
    for i in output:
        f.write(str(i) + '\n')