chr1A NRGenome gene 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1
chr1A NRGenome mRNA 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;coverage=100.0;identity=100.0;matches=1059;mismatches=0;indels=0;unknowns=0
chr1A NRGenome exon 1157233 1158291 100 + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +
chr1A NRGenome CDS 1157233 1158291 100 + 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +
chr1A NRGenome gene 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1
chr1A NRGenome mRNA 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;coverage=100.0;identity=100.0;matches=1527;mismatches=0;indels=0;unknowns=0
chr1A NRGenome exon 1162250 1162591 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +
chr1A NRGenome exon 1161953 1162150 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +
chr1A NRGenome exon 1161682 1161859 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +
chr1A NRGenome exon 1161377 1161547 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +
chr1A NRGenome exon 1160679 1160710 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +
chr1A NRGenome exon 1160535 1160577 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +
chr1A NRGenome exon 1160392 1160459 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +
chr1A NRGenome exon 1160086 1160127 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +
chr1A NRGenome exon 1159521 1159973 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +
chr1A NRGenome CDS 1162250 1162591 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +
chr1A NRGenome CDS 1161953 1162150 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +
chr1A NRGenome CDS 1161682 1161859 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +
chr1A NRGenome CDS 1161377 1161547 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +
chr1A NRGenome CDS 1160679 1160710 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +
chr1A NRGenome CDS 1160535 1160577 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +
chr1A NRGenome CDS 1160392 1160459 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +
chr1A NRGenome CDS 1160086 1160127 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +
chr1A NRGenome CDS 1159521 1159973 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +
chr1A NRGenome gene 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3
chr1A NRGenome mRNA 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;coverage=100.0;identity=100.0;matches=1434;mismatches=0;indels=0;unknowns=0
chr1A NRGenome exon 1162546 1162591 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +
chr1A NRGenome exon 1162250 1162452 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +
chr1A NRGenome exon 1161953 1162150 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +
chr1A NRGenome exon 1161682 1161859 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +
chr1A NRGenome exon 1161377 1161547 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +
chr1A NRGenome exon 1160679 1160710 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +
chr1A NRGenome exon 1160535 1160577 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +
chr1A NRGenome exon 1160392 1160459 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +
chr1A NRGenome exon 1160086 1160127 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +
chr1A NRGenome exon 1159521 1159973 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +
chr1A NRGenome CDS 1162546 1162591 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +
chr1A NRGenome CDS 1162250 1162452 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +
chr1A NRGenome CDS 1161953 1162150 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +
chr1A NRGenome CDS 1161682 1161859 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +
chr1A NRGenome CDS 1161377 1161547 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +
chr1A NRGenome CDS 1160679 1160710 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +
chr1A NRGenome CDS 1160535 1160577 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +
chr1A NRGenome CDS 1160392 1160459 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +
chr1A NRGenome CDS 1160086 1160127 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +
chr1A NRGenome CDS 1159521 1159973 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +
python代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from Bio import SeqIO
# fasta = open("/data2/masw_data/seqdb/chr1A.fasta", "rU")
# Random-access index over the reference FASTA; the loops below look records
# up by the sequence name taken from column 1 of the annotation.
record_dict = SeqIO.index("/data2/masw_data/seqdb/chr1A.fasta", "fasta")

# One output FASTA per kind of extracted sequence (opened in write mode, so
# existing files with these names are truncated).
gene_sequence = open('gene.fasta', 'w')
mRNA_sequence = open('mRNA.fasta', 'w')
exon_sequence = open('exon.fasta', 'w')
CDS_sequence = open('CDS.fasta', 'w')
pro_downstream = open('pro_and_downstream.fasta', 'w')

# Feature locations keyed by name. gene/mRNA hold a single
# (chrom, start, end, strand) tuple per name; exon/CDS hold a list of them.
gene, mRNA = {}, {}
exon, CDS = {}, {}
# Parse the annotation table ('1.txt') and collect the location of every
# gene / mRNA / exon / CDS feature, keyed by the feature's Name attribute.
#
# Each stored location is a (chrom, start, end, strand) tuple whose start/end
# are kept as the original strings; the extraction loops convert them to int.
with open('1.txt', 'r') as f:
    for line in f:
        fields = line.strip().split()
        # Skip blank lines, '#' comment/directive lines and truncated rows
        # instead of failing with an IndexError.
        if len(fields) < 9 or fields[0].startswith('#'):
            continue
        chrom, feature = fields[0], fields[2]
        start, end, direction = fields[3], fields[4], fields[6]
        # Look the Name attribute up by key rather than by position, so a
        # different attribute order does not yield a wrong name. Splitting on
        # whitespace cuts the attribute column at the first space (inside
        # Target=...), which is after Name in the rows this script targets.
        attributes = dict(
            item.split('=', 1) for item in fields[8].split(';') if '=' in item
        )
        name = attributes.get('Name')
        if name is None:
            # Without a Name the feature cannot be keyed; leave it out.
            continue
        location = (chrom, start, end, direction)
        if feature == 'gene':
            gene[name] = location
        elif feature == 'mRNA':
            mRNA[name] = location
        elif feature == 'exon':
            # A transcript has several exons: accumulate them in file order.
            exon.setdefault(name, []).append(location)
        elif feature == 'CDS':
            CDS.setdefault(name, []).append(location)
# Write every gene's full genomic span (introns included) to gene.fasta.
# Coordinates are 1-based inclusive, hence the "- 1" on the slice start;
# minus-strand genes are emitted as the reverse complement.
for gene_name, (chrom, begin, stop, strand) in gene.items():
    if strand not in ('+', '-'):
        # Features without a definite strand are not written.
        continue
    span = record_dict[chrom][int(begin) - 1:int(stop)].seq
    if strand == '-':
        span = span.reverse_complement()
    gene_sequence.write('>%s\n%s\n' % (gene_name, span))
# Write every mRNA's full genomic span (introns included) to mRNA.fasta.
# Coordinates are 1-based inclusive, hence the "- 1" on the slice start;
# minus-strand transcripts are emitted as the reverse complement.
for transcript_name, (chrom, begin, stop, strand) in mRNA.items():
    if strand not in ('+', '-'):
        # Features without a definite strand are not written.
        continue
    span = record_dict[chrom][int(begin) - 1:int(stop)].seq
    if strand == '-':
        span = span.reverse_complement()
    mRNA_sequence.write('>%s\n%s\n' % (transcript_name, span))
# Write each gene together with 2 kb of upstream (promoter) and 1 kb of
# downstream flanking sequence to pro_and_downstream.fasta, oriented 5'->3'.
#
# Coordinates are 1-based inclusive, so the gene itself is the 0-based slice
# [start - 1, end). On the minus strand "upstream" lies at higher coordinates
# and "downstream" at lower ones, so the flank sizes are swapped before the
# region is reverse-complemented.
for key, (chrom, start, end, strand) in gene.items():
    gene_start = int(start) - 1
    gene_end = int(end)
    if strand == '+':
        left, right = gene_start - 2000, gene_end + 1000
    elif strand == '-':
        left, right = gene_start - 1000, gene_end + 2000
    else:
        # Features without a definite strand are not written.
        continue
    # Clamp at the sequence start: a negative slice index would wrap around
    # to the end of the chromosome. The right edge is clamped by slicing.
    region = record_dict[chrom][max(left, 0):right].seq
    if strand == '-':
        region = region.reverse_complement()
    pro_downstream.write('>%s\n%s\n' % (key, region))
# Get the CDS sequence: write one FASTA record per transcript to CDS.fasta,
# with its CDS segments spliced together in 5'->3' transcript order.
for key, value in CDS.items():
    # value is a non-empty list of (chrom, start, end, strand) tuples with
    # 1-based inclusive string coordinates.
    strand = value[0][3]
    if strand not in ('+', '-'):
        # Without a definite strand the segment order is undefined.
        continue
    # Order the segments by genomic start instead of trusting the file order:
    # ascending for '+', descending for '-' (the 5' end of a minus-strand
    # transcript is at the highest coordinate).
    ordered = sorted(value, key=lambda seg: int(seg[1]), reverse=(strand == '-'))
    pieces = []
    for chrom, start, end, seg_strand in ordered:
        piece = record_dict[chrom][int(start) - 1:int(end)].seq
        if seg_strand == '-':
            piece = piece.reverse_complement()
        pieces.append(str(piece))
    # Join into a single sequence line; formatting the list itself would
    # write its repr rather than the nucleotides.
    CDS_sequence.write('>%s\n%s\n' % (key, ''.join(pieces)))
# Get the exon sequence: write one FASTA record per transcript to exon.fasta,
# with its exons spliced together in 5'->3' transcript order.
for key, value in exon.items():
    # value is a non-empty list of (chrom, start, end, strand) tuples with
    # 1-based inclusive string coordinates.
    strand = value[0][3]
    if strand not in ('+', '-'):
        # Without a definite strand the exon order is undefined.
        continue
    # Order the exons by genomic start instead of trusting the file order:
    # ascending for '+', descending for '-' (the 5' end of a minus-strand
    # transcript is at the highest coordinate).
    ordered = sorted(value, key=lambda seg: int(seg[1]), reverse=(strand == '-'))
    pieces = []
    for chrom, start, end, seg_strand in ordered:
        piece = record_dict[chrom][int(start) - 1:int(end)].seq
        if seg_strand == '-':
            piece = piece.reverse_complement()
        pieces.append(str(piece))
    # Join into a single sequence line; formatting the list itself would
    # write its repr rather than the nucleotides.
    exon_sequence.write('>%s\n%s\n' % (key, ''.join(pieces)))
# Close every output FASTA so buffered records are flushed to disk.
# exon_sequence is included here; leaving it open risks a truncated exon.fasta.
for handle in (gene_sequence, mRNA_sequence, exon_sequence,
               CDS_sequence, pro_downstream):
    handle.close()