1.输入文件为基因组文件和gff3文件,输出为5utr和3utr,并且utr已经考虑了正负链和可变剪接情况,意思是如果utr存在可变剪接,输出的文件已经给拼接好了,并且考虑了正负链和拼接方向
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# python lin_extract_5utr_cds_3utr2.py iwgsc_refseqv2.1_annotation_200916_HC_LC.gff3 ../iwgsc_refseqv2.1_assembly.fa output_5utr.fasta2 output_3utr.fasta2
import sys
from Bio import SeqIO
from collections import defaultdict
def extract_and_concatenate_utrs(gff_file, genome_file, output_5utr, output_3utr):
genome = SeqIO.to_dict(SeqIO.parse(genome_file, "fasta"))
utrs_5 = defaultdict(lambda: {'+': [], '-': []})
utrs_3 = defaultdict(lambda: {'+': [], '-': []})
with open(gff_file, 'r') as gff:
for line in gff:
if line.startswith('#') or not line.strip():
continue
parts = line.strip().split('\t')
if len(parts) < 9:
continue
seqid, source, feature_type, start, end, score, strand, phase, attributes = parts
if feature_type not in ['five_prime_UTR', 'three_prime_UTR']:
continue
attr_dict = {attr.split('=')[0]: attr.split('=')[1] for attr in attributes.split(';') if '=' in attr}
parent_id = attr_dict.get('Parent', 'unknown_parent')
utr_seq = genome[seqid].seq[int(start)-1:int(end)]
if strand == '-':
utr_seq = utr_seq.reverse_complement() # Reverse complement for negative strand
if feature_type == 'five_prime_UTR':
utrs_5[parent_id][strand].append((int(start), int(end), utr_seq))
elif feature_type == 'three_prime_UTR':
utrs_3[parent_id][strand].append((int(start), int(end), utr_seq))
def write_utrs(utrs_dict, output_file):
with open(output_file, 'w') as outfile:
for parent_id in utrs_dict:
for strand in utrs_dict[parent_id]:
# Sort by end position in descending order for negative strand, by start position for positive strand
sorted_utrs = sorted(utrs_dict[parent_id][strand], key=lambda x: x[1] if strand == '-' else x[0], reverse=(strand == '-'))
concatenated_seq = ''.join([str(utr[2]) for utr in sorted_utrs])
if concatenated_seq: # Ensure the concatenated sequence is not empty
outfile.write(f">{parent_id}\n{concatenated_seq}\n")
write_utrs(utrs_5, output_5utr)
write_utrs(utrs_3, output_3utr)
if __name__ == '__main__':
if len(sys.argv) < 5:
print("Usage: python extract_utrs.py <gff_file> <genome_file> <output_5utr.fasta> <output_3utr.fasta>")
sys.exit(1)
gff_file = sys.argv[1]
genome_file = sys.argv[2]
output_5utr = sys.argv[3]
output_3utr = sys.argv[4]
extract_and_concatenate_utrs(gff_file, genome_file, output_5utr, output_3utr)