import re
def parse_fasta(lines):
    """Extract the sequences from FASTA-formatted text lines.

    Args:
        lines: Iterable of text lines (for example an open file). Lines
            starting with ``>`` begin a new record; every other non-blank
            line is appended to the current record's sequence.

    Returns:
        A list with one sequence string per record, in file order. Records
        that share the same header are kept as separate entries.

    Raises:
        ValueError: If sequence data appears before the first header.
    """
    records = []
    for raw in lines:
        # strip() also removes '\r', so files with CRLF line endings work.
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            records.append([])
        elif not records:
            raise ValueError("sequence data found before the first FASTA header")
        else:
            records[-1].append(line)
    return ["".join(parts) for parts in records]


def _overlap_length(left, right):
    """Return the length of the longest suffix of *left* that is a prefix of *right*."""
    if not left or not right:
        return 0
    # A suffix longer than `right` can never be a prefix of it.
    start = max(0, len(left) - len(right))
    while True:
        # Jump straight to the next position where the overlap could begin.
        start = left.find(right[0], start)
        if start == -1:
            return 0
        if right.startswith(left[start:]):
            # Candidates are scanned from the longest suffix to the shortest,
            # so the first hit is the maximal overlap.
            return len(left) - start
        start += 1


def _drop_contained(reads):
    """Return the non-empty, distinct reads that are not substrings of another read."""
    unique = list(dict.fromkeys(read for read in reads if read))
    return [
        read
        for read in unique
        if not any(read != other and read in other for other in unique)
    ]


def assemble(reads):
    """Assemble reads into a single superstring by greedy overlap merging.

    The pair of reads with the longest suffix/prefix overlap is merged
    repeatedly until one string is left. The order of *reads* does not
    matter. Pairs with no overlap are simply concatenated, so the result
    always contains every input read.

    NOTE: this is the standard greedy heuristic; it is not guaranteed to
    give the shortest possible superstring for arbitrary inputs.

    Args:
        reads: Iterable of sequence strings.

    Returns:
        The assembled string, or ``""`` when there are no non-empty reads.
    """
    contigs = _drop_contained(reads)
    while len(contigs) > 1:
        best_len, best_left, best_right = -1, 0, 1
        for i, left in enumerate(contigs):
            for j, right in enumerate(contigs):
                if i == j:
                    continue
                length = _overlap_length(left, right)
                if length > best_len:
                    best_len, best_left, best_right = length, i, j
        merged = contigs[best_left] + contigs[best_right][best_len:]
        # Drop the two merged reads and anything the new contig now covers.
        contigs = [
            contig
            for index, contig in enumerate(contigs)
            if index != best_left and index != best_right and contig not in merged
        ]
        contigs.append(merged)
    return contigs[0] if contigs else ""


def main(path="../examples/ros_bio25_LONG.txt"):
    """Read the FASTA file at *path* and print the assembled contig."""
    with open(path) as f:
        reads = parse_fasta(f)
    print(assemble(reads))


if __name__ == "__main__":
    main()
# 09-26