引言
最近为了给平台上加上一个将DNA序列翻译为蛋白序列的工具,写了一个任何生信玩家初学时都会写的代码。看了一些别人的翻译工具,我也想尽量把代码写的完整一点,在这个过程中首次接触并使用了BioPython,目前看起来还是很好用的。
代码
#!/bin/python3
from Bio.Seq import translate, reverse_complement
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
import os
import sys
import getopt
import warnings
import time
warnings.filterwarnings("ignore")
def translate2(input_file, translated_file, corr_nucl_file, corr_prot_file, summary_file, read_start, rev_comp, no_stop, genetic_code):
'''
input_file: nucleic acid sequence fasta file
translated_file: output file 1, protein sequence fasta file
summary_file: input file information
read_start: 1(default). The position that translation starts
rev_comp: no(default). Do not use reverse complemetary sequences for translation.
non_stop: yes(default). Show stop codon as * in translated sequence.
'''
#Translate
nucl_dict = SeqIO.to_dict(SeqIO.parse(input_file,"fasta"), key_function = lambda rec: rec.description) #keep whitespace in FASTA header
prot_dict = {
}
corr_nucl_dict = {
}
corr_prot_dict = {
}
read_start = int(read_start)
stop_sign = 0 #count how many sequences contain stop codon
for key in nucl_dict:
nucl_seq = nucl_dict[key].seq[read_start-1:]
if rev_comp == 'no':
prot_seq = translate(nucl_seq,table=genetic_code)
else:
prot_seq = translate(reverse_complement(nucl_seq),table=genetic_code)
prot_dict[key] = prot_seq
if '*' in prot_seq:
stop_sign