最近找基因组注释的时候发现一个基因组只上传了embl格式,为了下游分析想把它分成fasta和gff3,但是该embl格式有一点奇怪,常用的转格式软件转不成功,那就自己写个脚本好了。
该embl文件大概长这样:
ID XXX; XXX; linear; other RNA; XXX; XXX; 177118352 BP.
XX
AC XXX;
XX
AC * _scaf_1
XX
PR Project:PRJEB51672;
XX
DT 04-NOV-2022 (Rel. 133, Created)
XX
DE XXX
XX
KW .
XX
OS Poa pratensis
XX
OC cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina;
OC Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida;
OC Mesangiospermae; Liliopsida; Petrosaviidae; commelinids; Poales; Poaceae;
OC BOP clade; Pooideae; Poodae; Poeae; Poeae Chloroplast Group 2 (Poeae type);
OC Poodinae; Poinae; Poa.
XX
RN [1]
RP 1-177118352
RG XXX
RT ;
RL Submitted (04-NOV-2022) to the INSDC.
XX
FH Key Location/Qualifiers
FH
FT source 1..177118352
FT /mol_type="other RNA"
FT /organism="Poa pratensis"
FT gene complement(11455981..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462"
FT /note="source:PanAnd"
FT mRNA complement(11455981..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001"
FT /note="source:PanAnd"
FT exon complement(11455981..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001.exon.1"
FT /note="source:PanAnd"
FT 3'UTR complement(11455981..11456845)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001.three_prime_utr.4"
FT /note="source:PanAnd"
FT CDS complement(11456846..11457412)
FT /codon_start=1
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_P001"
FT /note="source:PanAnd"
FT /transl_table=1
FT 5'UTR complement(11457413..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001.five_prime_utr.3"
FT /note="source:PanAnd"
FT gene complement(11455981..11461780)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461"
FT /note="source:PanAnd"
FT mRNA complement(join(11455981..11456778,11461090..11461780))
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001"
FT /note="source:PanAnd"
FT exon complement(11455981..11456778)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.exon.1"
FT /note="source:PanAnd"
FT exon complement(11461090..11461780)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.exon.2"
FT /note="source:PanAnd"
FT 3'UTR complement(join(11455981..11456778,11461090..11461145))
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.three_prime_utr.5"
FT /note="ID:Ppr00001aa254461_T001.three_prime_utr.6"
FT /note="source:PanAnd"
FT CDS complement(11461146..11461712)
FT /codon_start=1
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_P001"
FT /note="source:PanAnd"
FT /transl_table=1
FT 5'UTR complement(11461713..11461780)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.five_prime_utr.4"
FT /note="source:PanAnd"
//
好多东西暂时没用,比如UTR之类的,就先不提取了。
#注释过几天再补!
#!/usr/bin/perl
use strict; use warnings; use Getopt::Std;
my $usage = "embl2other.pl -e <embl file> -g <output gff3 file> or -f <output fasta file>";
my %opt;
getopts('e:g:f:', \%opt);
my $AC;
#如果需要转成fasta
if($opt{
f}){
my $SQ;
#创建或覆盖目标文件
open(FASTA,">","$opt{f}");
print "";
close(FASTA);
#打开文件
open(FASTA,">>","$opt{f}");
open(EMBL,"$opt{e}") or die $!;
while(<EMBL>){
#提取AC,即染色体/contig编号,源文件中有两个AC,抛弃内容只有XXX的一个,再存入文件
if(m/^AC/){
next if (split(/\s+/))[1] eq "XXX;";
$AC = (split(/\s+/))[2];
print FASTA ">".$AC."\n";
}
#从SQ行到\\行之间提取序列并存入目标文件
if(m/^SQ/ .. m/^\/\//){
next if (m/^SQ/);
next if (m/^\/\//);
s/[^A-Z]//g;
$SQ = $_;
print FASTA $SQ."\n";
}
}
#关闭文件
close(FASTA);
close(EMBL)