lin_extract_5utr_cds_3utr.py

不懂python不懂R

已于 2024-03-25 14:24:58 修改

阅读量337

点赞数 7

分类专栏： python 文章标签： python

于 2024-03-21 09:26:19 首次发布

本文链接：https://blog.csdn.net/weixin_44231554/article/details/136896723

版权

python 专栏收录该内容

18 篇文章 0 订阅

订阅专栏

1.输入文件为基因组文件和gff3文件，输出为5utr和3utr，并且utr已经考虑了正负链和可变剪接情况，意思是如果utr存在可变剪接，输出的文件已经给拼接好了，并且考虑了正负链和拼接方向

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# python lin_extract_5utr_cds_3utr2.py iwgsc_refseqv2.1_annotation_200916_HC_LC.gff3 ../iwgsc_refseqv2.1_assembly.fa output_5utr.fasta2 output_3utr.fasta2
import sys
from Bio import SeqIO
from collections import defaultdict

def extract_and_concatenate_utrs(gff_file, genome_file, output_5utr, output_3utr):
    genome = SeqIO.to_dict(SeqIO.parse(genome_file, "fasta"))
    utrs_5 = defaultdict(lambda: {'+': [], '-': []})
    utrs_3 = defaultdict(lambda: {'+': [], '-': []})

    with open(gff_file, 'r') as gff:
        for line in gff:
            if line.startswith('#') or not line.strip():
                continue
            parts = line.strip().split('\t')
            if len(parts) < 9:
                continue
            seqid, source, feature_type, start, end, score, strand, phase, attributes = parts
            if feature_type not in ['five_prime_UTR', 'three_prime_UTR']:
                continue

            attr_dict = {attr.split('=')[0]: attr.split('=')[1] for attr in attributes.split(';') if '=' in attr}
            parent_id = attr_dict.get('Parent', 'unknown_parent')

            utr_seq = genome[seqid].seq[int(start)-1:int(end)]
            if strand == '-':
                utr_seq = utr_seq.reverse_complement()  # Reverse complement for negative strand
            if feature_type == 'five_prime_UTR':
                utrs_5[parent_id][strand].append((int(start), int(end), utr_seq))
            elif feature_type == 'three_prime_UTR':
                utrs_3[parent_id][strand].append((int(start), int(end), utr_seq))

    def write_utrs(utrs_dict, output_file):
        with open(output_file, 'w') as outfile:
            for parent_id in utrs_dict:
                for strand in utrs_dict[parent_id]:
                    # Sort by end position in descending order for negative strand, by start position for positive strand
                    sorted_utrs = sorted(utrs_dict[parent_id][strand], key=lambda x: x[1] if strand == '-' else x[0], reverse=(strand == '-'))
                    concatenated_seq = ''.join([str(utr[2]) for utr in sorted_utrs])
                    if concatenated_seq:  # Ensure the concatenated sequence is not empty
                        outfile.write(f">{parent_id}\n{concatenated_seq}\n")

    write_utrs(utrs_5, output_5utr)
    write_utrs(utrs_3, output_3utr)

if __name__ == '__main__':
    if len(sys.argv) < 5:
        print("Usage: python extract_utrs.py <gff_file> <genome_file> <output_5utr.fasta> <output_3utr.fasta>")
        sys.exit(1)

    gff_file = sys.argv[1]
    genome_file = sys.argv[2]
    output_5utr = sys.argv[3]
    output_3utr = sys.argv[4]

    extract_and_concatenate_utrs(gff_file, genome_file, output_5utr, output_3utr)