lin_extract_5utr_cds_3utr.py

1.输入文件为基因组文件和gff3文件,输出为5utr和3utr,并且utr已经考虑了正负链和可变剪接情况,意思是如果utr存在可变剪接,输出的文件已经给拼接好了,并且考虑了正负链和拼接方向

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# python lin_extract_5utr_cds_3utr2.py iwgsc_refseqv2.1_annotation_200916_HC_LC.gff3 ../iwgsc_refseqv2.1_assembly.fa output_5utr.fasta2 output_3utr.fasta2
import sys
from Bio import SeqIO
from collections import defaultdict

def extract_and_concatenate_utrs(gff_file, genome_file, output_5utr, output_3utr):
    genome = SeqIO.to_dict(SeqIO.parse(genome_file, "fasta"))
    utrs_5 = defaultdict(lambda: {'+': [], '-': []})
    utrs_3 = defaultdict(lambda: {'+': [], '-': []})

    with open(gff_file, 'r') as gff:
        for line in gff:
            if line.startswith('#') or not line.strip():
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9:
                continue
            seqid, source, feature_type, start, end, score, strand, phase, attributes = parts
            if feature_type not in ['five_prime_UTR', 'three_prime_UTR']:
                continue

            attr_dict = {attr.split('=')[0]: attr.split('=')[1] for attr in attributes.split(';') if '=' in attr}
            parent_id = attr_dict.get('Parent', 'unknown_parent')

            utr_seq = genome[seqid].seq[int(start)-1:int(end)]
            if strand == '-':
                utr_seq = utr_seq.reverse_complement()  # Reverse complement for negative strand
            if feature_type == 'five_prime_UTR':
                utrs_5[parent_id][strand].append((int(start), int(end), utr_seq))
            elif feature_type == 'three_prime_UTR':
                utrs_3[parent_id][strand].append((int(start), int(end), utr_seq))

    def write_utrs(utrs_dict, output_file):
        with open(output_file, 'w') as outfile:
            for parent_id in utrs_dict:
                for strand in utrs_dict[parent_id]:
                    # Sort by end position in descending order for negative strand, by start position for positive strand
                    sorted_utrs = sorted(utrs_dict[parent_id][strand], key=lambda x: x[1] if strand == '-' else x[0], reverse=(strand == '-'))
                    concatenated_seq = ''.join([str(utr[2]) for utr in sorted_utrs])
                    if concatenated_seq:  # Ensure the concatenated sequence is not empty
                        outfile.write(f">{parent_id}\n{concatenated_seq}\n")

    write_utrs(utrs_5, output_5utr)
    write_utrs(utrs_3, output_3utr)

if __name__ == '__main__':
    if len(sys.argv) < 5:
        print("Usage: python extract_utrs.py <gff_file> <genome_file> <output_5utr.fasta> <output_3utr.fasta>")
        sys.exit(1)

    gff_file = sys.argv[1]
    genome_file = sys.argv[2]
    output_5utr = sys.argv[3]
    output_3utr = sys.argv[4]

    extract_and_concatenate_utrs(gff_file, genome_file, output_5utr, output_3utr)
  • 7
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值