[生信]基于python的CodonTable包对基因序列进行翻译可替换密码子表

|
运行环境:centos
语言:python3
相关程序包:argparse、Bio

|
程序:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.Seq import Seq

# 获取密码子表全部分类
def check_codon_table_type():
    table_names = list(CodonTable.unambiguous_dna_by_name.keys())
    names_idx = []
    for name in table_names:
        names_idx.append("{}:{} ".format(str(table_names.index(name)), str(name)))
    names_idx = str(names_idx).replace("', '", " | ").replace("['", "").replace("']", "")
    return [names_idx, table_names]


def trans(prefix, fa_file, gff_file, codon_table_id):
    # table_id to name
    table_list = check_codon_table_type()[1]
    if codon_table_id not in range(len(table_list)):
        raise Exception("codon编号输入有误")
    codon_table_name = table_list[codon_table_id]
    codon_table = CodonTable.unambiguous_dna_by_name[codon_table_name]

    # read fa
    fa_data = SeqIO.to_dict(SeqIO.parse(fa_file, "fasta"))

    # read gff
    cds_positions = []
    with open(gff_file, "r") as gff:
        for line in gff:
            if line.startswith("#"):
                continue
            fields = line.strip().split("\t")
            chr_id = fields[0]
            feature_type = fields[2]
            if feature_type == "CDS":
                cds_positions.append((int(fields[3]), int(fields[4]), chr_id))
        
    # trans
    translated_sequences = []
    for start, end, id in cds_positions:
        cds_sequence = fa_data.get(id).seq[start-1:end]
        seq =  str(cds_sequence)
        # 分割序列为三个碱基的密码子列表
        codons = [seq[i:i+3] for i in range(0, len(seq), 3)]
        # 遍历碱基,替换类似于“-CG”, “NNN”的不合法密码子
        for i in range(len(codons)):
            if 'N' in codons[i] or '-' in codons[i]:
                codons[i] = '---'
        cds_sequence = Seq(''.join(codons))
        translated_sequence = cds_sequence.translate(table=codon_table)
        translated_sequences.append((id, translated_sequence))

    # write
    count = {}
    for id in fa_data.keys():
        count[id] = 0
    with open(str(prefix) + '.out.fa', 'w') as outf:
        for l in translated_sequences:
            outf.write('>' + l[0] + '_' + str(count[l[0]]) + '\n')
            count[l[0]] += 1
            outf.write(str(l[1]) + '\n')
    return 0

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--prefix', required=True, type=str, help="输出前缀")
    parser.add_argument('--fa', required=True, type=str, help="基因组")
    parser.add_argument('--gff', required=True, type=str, help="基因组注释")
    parser.add_argument('--codon', required=True, type=int, help = "密码子表分类编号:[0-44], 具体如下:")
    parser.epilog = "编号与分类对应关系:" + check_codon_table_type()[0]
    args = parser.parse_args()
    trans(args.prefix, args.fa, args.gff, args.codon)

if __name__ == "__main__":
    main()
  • 7
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值