提取细菌基因组ORF思路:
1.通过FNA文件得到细菌基因组序列
2.分正负链和三个相位共6种情况统计ORF
3.写入文件
转载请保留出处!
贴上Python代码(版本:3.6)
1 # -*- coding: utf-8 -*-
2 """
3 Created on Thu Dec 14 13:19:00 2017
4
5 @author: zxzhu
6 """
7
8 import re
9 def N2M(sequence): #正负链转换
10 hash = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C','N':'N'}
11 sequence = ''.join([hash[i] for i in sequence])
12 return sequence[::-1]
13
14 def translate(seq): #将序列转换为起始,终止,其他密码子
15 pa1 = re.compile(r'TAA|TAG|TGA')
16 after_trans = ''
17 for i in range(0,len(seq),3):
18 if seq[i:i+3]=='ATG':
19 after_trans+='I'
20 elif pa1.match(seq[i:i+3]):
21 after_trans+='T'
22 else:
23 after_trans+='O'
24 return after_trans
25
26 def get_orf(seq,length=90):
27 pa2 = re.compile(r'I[IO]+?T') #匹配模式:起始1非终止1~N终止1
28 trans_seq = translate(seq)
29 m = pa2.finditer(trans_seq) #所有匹配结果的迭代
30 index = []
31 orf = []
32 for i in m:
33 index.append(i.span()) #序列起始,终止位置
34 for i in index:
35 orf_start = i[0]*3
36 orf_end = i[1]*3
37 #print(orf_start,orf_end)
38 if orf_end - orf_start >= length: #不小于90bp
39 orf.append(seq[orf_start:orf_end])
40 return orf
41
42 def Seq2AA(sequence,hash): #翻译为AA序列
43 AA=''
44 for i in range(0, len(sequence) - 3, 3):
45 AA += hash[sequence[i:i + 3]]
46 return AA
47
48 def main(fna,length=90):
49 fn = open(fna)
50 pa = re.compile(r'\s+')
51 hash_seq = {} # CDS hash,CDS2sequence
52 result1 = open('orf_seq.txt','w')
53 result2 = open('orf_AA.txt','w')
54 start = [0,1,2] #相位
55 strain = '+-' #正负链
56 hash_AA = {} # AA hash,sequence2AA
57 with open('AA.txt', 'r') as f: #AA.txt 为密码子表
58 for line in f:
59 line = line.strip()
60 if line:
61 line = pa.split(line)
62 hash_AA[line[0]] = line[1] #AA hash
63
64 for line in fn: #获取序列
65 line = line.strip()
66 if line.startswith('>'):
67 A = pa.split(line)[0].replace('>', '')
68 hash_seq[A] = ''
69 else:
70 hash_seq[A] += line
71
72 for key in hash_seq.keys(): #分+-链,3个相位统计ORF
73 seq = hash_seq[key]
74 for r in strain:
75 if r == '-':
76 seq = N2M(seq)
77 for s in start:
78 seq = seq[s:]
79 #trans_seq = translate(seq)
80 orf = get_orf(seq)
81 for i in orf:
82 if 'N' not in i: #去除N
83 AA =Seq2AA(i,hash_AA)
84 result1.write('>'+key+'\t'+r+'\t'+str(s)+'\n'+i+'\n')
85 result2.write('>'+key+'\t'+r+'\t'+str(s)+'\n'+AA+'\n')
86 fn.close()
87 result1.close()
88 result2.close()
89
90
91 fna = 'GCA_000160075.2_ASM16007v2_genomic.fna'
92 main(fna)
NCBI可以找ORF,很方便。码一下:ORFfinder