目的:统计fasta中序列AACA的数量
1、方法1,不支持序列换行
'''
satistics Num "AACA" by fasta file
not support each_basic by multi-line
demoDATA:
>chr1
AAAA
>chr2
AAAA
'''
import re
input = "test.fa"
seq_sum={}
with open(input, 'r')as seq_file:
for line in seq_file:
if line.startswith('>'):
seq_id = line[1:-1]
seq_sum[seq_id] = 0
else:
it = len(re.findall("AACA", line))
# print(it)
seq_sum[seq_id] = it
print(seq_sum)
2、先将fa转换格式,再统计
import re
input = "xtt_3.fa"
output = "out.txt"
dicCount = {}
file = open(input, "r")
out = open(output, "w")
for line in file:
line = line.strip()
if line.startswith(">"):
out.write("\n{0}\n".format(line))
else:
out.write("{0}".format(line))
out.close()
file.close()
out1 = open(output, "r")
for line in out1:
line = line.strip()
if len(line) != 0:
if line.startswith(">"):
a = line[1:]
dicCount[a] = 0
else:
count = re.findall("AACA", line)
dicCount[a] = len(count)
print(dicCount)'
3、方法三,建立字典
'''
satistics Num "AACA" by fasta file
support each_basic by multi-line
'''
import re
input = 'xtt_3.fa'
output = '3/xtt_3_out.txt'
seq = {}
seq_sum = {}
with open(input, 'r')as seq_file:
for line in seq_file:
if line.startswith('>'):
name = line.split()[0][1:]
seq[name] = ""
else:
seq[name] = seq[name] + line.replace("\n", "")
for i in seq:
count = len(re.findall("AACA", seq[i]))
seq_sum[i] = count
print(seq_sum)
with open(output, 'w') as newfile:
newfile.write("ID\tcounts\n")
for key, value in seq_sum.items():
newfile.write(key + '\t' + str(value) + '\n')
4、方法4 精简版
'''
write by xialei
satistics Num "AACA" by fasta file
support each_basic by multi-line
'''
import re
seq_sum = {}
input = '3/xtt_3.fa'
with open(input, 'r') as f:
l = f.read()
# chr_ = re.split('>(.*)\n',l)
ls = l.split('>')
ls.remove("")
for each in ls:
print(each)
k = re.findall('(chr.*)\n', each)[0]
it = len(re.findall("AACA", each.replace('\n', '')))
seq_sum[k] = it
print(seq_sum)