|
运行环境:centos
语言:python3
相关程序包:argparse、Bio
|
程序:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.Seq import Seq
# 获取密码子表全部分类
def check_codon_table_type():
table_names = list(CodonTable.unambiguous_dna_by_name.keys())
names_idx = []
for name in table_names:
names_idx.append("{}:{} ".format(str(table_names.index(name)), str(name)))
names_idx = str(names_idx).replace("', '", " | ").replace("['", "").replace("']", "")
return [names_idx, table_names]
def trans(prefix, fa_file, gff_file, codon_table_id):
# table_id to name
table_list = check_codon_table_type()[1]
if codon_table_id not in range(len(table_list)):
raise Exception("codon编号输入有误")
codon_table_name = table_list[codon_table_id]
codon_table = CodonTable.unambiguous_dna_by_name[codon_table_name]
# read fa
fa_data = SeqIO.to_dict(SeqIO.parse(fa_file, "fasta"))
# read gff
cds_positions = []
with open(gff_file, "r") as gff:
for line in gff:
if line.startswith("#"):
continue
fields = line.strip().split("\t")
chr_id = fields[0]
feature_type = fields[2]
if feature_type == "CDS":
cds_positions.append((int(fields[3]), int(fields[4]), chr_id))
# trans
translated_sequences = []
for start, end, id in cds_positions:
cds_sequence = fa_data.get(id).seq[start-1:end]
seq = str(cds_sequence)
# 分割序列为三个碱基的密码子列表
codons = [seq[i:i+3] for i in range(0, len(seq), 3)]
# 遍历碱基,替换类似于“-CG”, “NNN”的不合法密码子
for i in range(len(codons)):
if 'N' in codons[i] or '-' in codons[i]:
codons[i] = '---'
cds_sequence = Seq(''.join(codons))
translated_sequence = cds_sequence.translate(table=codon_table)
translated_sequences.append((id, translated_sequence))
# write
count = {}
for id in fa_data.keys():
count[id] = 0
with open(str(prefix) + '.out.fa', 'w') as outf:
for l in translated_sequences:
outf.write('>' + l[0] + '_' + str(count[l[0]]) + '\n')
count[l[0]] += 1
outf.write(str(l[1]) + '\n')
return 0
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--prefix', required=True, type=str, help="输出前缀")
parser.add_argument('--fa', required=True, type=str, help="基因组")
parser.add_argument('--gff', required=True, type=str, help="基因组注释")
parser.add_argument('--codon', required=True, type=int, help = "密码子表分类编号:[0-44], 具体如下:")
parser.epilog = "编号与分类对应关系:" + check_codon_table_type()[0]
args = parser.parse_args()
trans(args.prefix, args.fa, args.gff, args.codon)
if __name__ == "__main__":
main()