最近找基因组注释的时候发现一个基因组只上传了embl格式,为了下游分析想把它分成fasta和gff3,但是该embl格式有一点奇怪,常用的转格式软件转不成功,那就自己写个脚本好了。
该embl文件大概长这样:
ID XXX; XXX; linear; other RNA; XXX; XXX; 177118352 BP.
XX
AC XXX;
XX
AC * _scaf_1
XX
PR Project:PRJEB51672;
XX
DT 04-NOV-2022 (Rel. 133, Created)
XX
DE XXX
XX
KW .
XX
OS Poa pratensis
XX
OC cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina;
OC Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida;
OC Mesangiospermae; Liliopsida; Petrosaviidae; commelinids; Poales; Poaceae;
OC BOP clade; Pooideae; Poodae; Poeae; Poeae Chloroplast Group 2 (Poeae type);
OC Poodinae; Poinae; Poa.
XX
RN [1]
RP 1-177118352
RG XXX
RT ;
RL Submitted (04-NOV-2022) to the INSDC.
XX
FH Key Location/Qualifiers
FH
FT source 1..177118352
FT /mol_type="other RNA"
FT /organism="Poa pratensis"
FT gene complement(11455981..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462"
FT /note="source:PanAnd"
FT mRNA complement(11455981..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001"
FT /note="source:PanAnd"
FT exon complement(11455981..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001.exon.1"
FT /note="source:PanAnd"
FT 3'UTR complement(11455981..11456845)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001.three_prime_utr.4"
FT /note="source:PanAnd"
FT CDS complement(11456846..11457412)
FT /codon_start=1
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_P001"
FT /note="source:PanAnd"
FT /transl_table=1
FT 5'UTR complement(11457413..11457619)
FT /locus_tag="PPR_LOCUS256280"
FT /note="ID:Ppr00001aa254462_T001.five_prime_utr.3"
FT /note="source:PanAnd"
FT gene complement(11455981..11461780)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461"
FT /note="source:PanAnd"
FT mRNA complement(join(11455981..11456778,11461090..11461780))
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001"
FT /note="source:PanAnd"
FT exon complement(11455981..11456778)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.exon.1"
FT /note="source:PanAnd"
FT exon complement(11461090..11461780)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.exon.2"
FT /note="source:PanAnd"
FT 3'UTR complement(join(11455981..11456778,11461090..11461145))
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.three_prime_utr.5"
FT /note="ID:Ppr00001aa254461_T001.three_prime_utr.6"
FT /note="source:PanAnd"
FT CDS complement(11461146..11461712)
FT /codon_start=1
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_P001"
FT /note="source:PanAnd"
FT /transl_table=1
FT 5'UTR complement(11461713..11461780)
FT /locus_tag="PPR_LOCUS256281"
FT /note="ID:Ppr00001aa254461_T001.five_prime_utr.4"
FT /note="source:PanAnd"
//
好多东西暂时没用,比如UTR之类的,就先不提取了。
#注释过几天再补!
#!/usr/bin/perl
use strict; use warnings; use Getopt::Std;
my $usage = "embl2other.pl -e <embl file> -g <output gff3 file> or -f <output fasta file>";
my %opt;
getopts('e:g:f:', \%opt);
my $AC;
#如果需要转成fasta
if($opt{f}){
my $SQ;
#创建或覆盖目标文件
open(FASTA,">","$opt{f}");
print "";
close(FASTA);
#打开文件
open(FASTA,">>","$opt{f}");
open(EMBL,"$opt{e}") or die $!;
while(<EMBL>){
#提取AC,即染色体/contig编号,源文件中有两个AC,抛弃内容只有XXX的一个,再存入文件
if(m/^AC/){
next if (split(/\s+/))[1] eq "XXX;";
$AC = (split(/\s+/))[2];
print FASTA ">".$AC."\n";
}
#从SQ行到\\行之间提取序列并存入目标文件
if(m/^SQ/ .. m/^\/\//){
next if (m/^SQ/);
next if (m/^\/\//);
s/[^A-Z]//g;
$SQ = $_;
print FASTA $SQ."\n";
}
}
#关闭文件
close(FASTA);
close(EMBL);
}
#如果需要转成gff3
if($opt{g}){
my $feature;
my @start;
my @end;
my $strand;
my $LOC;
my @ID;
my $source;
my $gene_id;
my @mRNA_id;
my $cds_num;
#创建或覆盖目标文件
open(GFF,">","$opt{g}");
print "";
close(GFF);
#打开文件
open(GFF,">>","$opt{g}");
open(EMBL,"$opt{e}") or die $!;
while(<EMBL>){
#提取AC,抛弃只含XXX的
if(m/^ID/ .. m/^\/\//){
if(m/^AC/){
next if (split(/\s+/))[1] eq "XXX;";
$AC = (split(/\s+/))[2];
}
#当到达FT注释行时
if(m/^FT/){
#跳过source注释和gap注释
next if (m/ source|\/mol_type|\/organism|gap|\/estimated_length/);
#按空格分割每行
my @FT_line = split(/\s+/);
#空格分隔后有三列时,提取feature、start、end、strand
if(@FT_line == 3){
$feature = $FT_line[1];
#调用自定义函数getcoor
my ($start_res,$end_res) = getcoor($_);
@start = @$start_res;
@end = @$end_res;
#包含complement的认为是反向元素
$strand = ($FT_line[2] =~ /complement/ ? "-" : "+");
#初始化ID
@ID = ();
}
#空格分隔后有两列时,提取LOC、ID、source
if(@FT_line == 2){
#跳过3'UTR、5'UTR的注释以及CDS注释的最后一行
next if ($feature =~ /3'UTR|5'UTR/);
next if (m/\/transl_table=/);
$LOC = $1 if(m/\/locus_tag="(.*)"/);
push(@ID,$1) if(m/\/note="ID:(.*)"/);
$source = $1 if(m/\/note="source:(\w+)"/);
}
#当读取到feature为gene,且读到source,即每个feature的最后一行时将对应gene元素存入文件
if($feature eq "gene" and $source){
my @gene_output = ($AC,$source,$feature,$start[0],$end[0],"\.",$strand,"\.","ID=$ID[0];locus_tag=$LOC");
print GFF join("\t",@gene_output)."\n";
#初始化source、mRNA_id、cds_num,保存gene_id
$source = undef;
$gene_id = $ID[0];
@mRNA_id = ();
$cds_num = 0;
}
#当读取到feature为mRNA,且读到source,即每个feature的最后一行时将对应mRNA、exon元素存入文件
if($feature eq "mRNA" and $source){
#存入mRNA元素
my @mRNA_output = ($AC,$source,$feature,$start[0],$end[-1],"\.",$strand,"\.","ID=$ID[0];Parent=$gene_id");
print GFF join("\t",@mRNA_output)."\n";
push(@mRNA_id,$ID[0]);
#存入exon元素
for my $e_num (0 .. @start-1){
my @exon_output = ($AC,$source,"exon",$start[$e_num],$end[$e_num],"\.",$strand,"\.","ID=$ID[0].exon.".($e_num+1).";Parent=$ID[0]");
print GFF join("\t",@exon_output)."\n";
###如果源文件中exon命名不是从1开始连续的增加,那这里会有和源文件对不上的问题,暂时不知道该怎么优化
}
#初始化source
$source = undef;
}
#原本想在这里提取exon保证exon命名统一,但是不同mRNA的共享exon该embl文件中只有1个,
#只靠exon这个feature无法得到与mRNA的对应关系,如果通过坐标判断就太繁琐了,
#所以改在处理mRNA的时候一并提取exon
#这里就相当于跳过exon的feature段
if($feature eq "exon" and $source){
$source = undef;
next;
}
#当读取到feature为CDS,且读到source,即每个feature的最后一行时将对应CDS元素存入文件
if($feature eq "CDS" and $source){
#定义phase,按坐标来说,正向的第一个CDS和反向的最后一个CDS的phase一般一直都是0
my @phase = (0);
#当strand为正向,且CDS数量大于一个时
if($strand eq "+" and @start > 1){
#计算第二个到最后一个CDS的phase
for my $num (0 .. @start-2){
$phase[$num+1] = (3-($end[$num]-$start[$num]+1+$phase[$num])%3);
$phase[$num+1] = 0 if ($phase[$num+1] == 3);
}
#从第一个CDS开始写入GFF
for my $i (0 .. @start-1){
my @CDS_output = ($AC,$source,$feature,$start[$i],$end[$i],"\.",$strand,$phase[$i],"ID=$ID[0].cds.".($i+1).";Parent=$mRNA_id[$cds_num]");
print GFF join("\t",@CDS_output)."\n";
}
#当strand为反向,且CDS数量大于1个时
}elsif($strand eq "-" and @start > 1){
#计算倒数第二个到第一个CDS的phase
for my $num (0 .. @start-2){
$phase[$num+1] = (3-($end[@start-2-$num+1]-$start[@start-2-$num+1]+1+$phase[$num])%3);
$phase[$num+1] = 0 if ($phase[$num+1] == 3);
}
#从第一个CDS开始写入GFF
for my $i (0 .. @start-1){
my @CDS_output = ($AC,$source,$feature,$start[$i],$end[$i],"\.",$strand,$phase[@start-1-$i],"ID=$ID[0].cds.".($i+1).";Parent=$mRNA_id[$cds_num]");
print GFF join("\t",@CDS_output)."\n";
}
#当只存在一个CDS的时候,直接按phase=0写入
}elsif(@start == 1){
my @CDS_output = ($AC,$source,$feature,$start[0],$end[0],"\.",$strand,"0","ID=$ID[0].cds."."1".";Parent=$mRNA_id[$cds_num]");
print GFF join("\t",@CDS_output)."\n";
}
#重置source变量,准备下一轮判断
$source = undef;
#每完成一轮CDS的输出,计数加1
$cds_num += 1;
#该格式文件中mRNA数量应该和cds输出轮数相等,如果cds轮数大于mRNA数量说明解析出错
#其实小于应该也要die掉,但判断就不能放在这里了,还得修改一下
die("Number of CDS and Number of mRNA could not be paired!") if ($cds_num > scalar(@mRNA_id));
}
}
}
}
close(EMBL);
close(GFF);
}
#自定义函数,作用是给一个feature的起始行进去,成对输出若干个元素的起始位点和终止位点
sub getcoor{
my ($line) = @_;
my @start;
my @end;
#去掉行尾换行符
chomp $line;
#当行末尾存在逗号时
if($line =~ /,$/){
#去掉行尾换行符
chomp $line;
#按空格分割取第三列,即保存坐标的列
$line = (split(/\s+/,$line))[2];
#读取该行的下一行
my $next_line = <EMBL>;
#当下一行末尾也存在逗号时
while($next_line =~ /,$/){
#去掉行尾的换行符
chomp $next_line;
#按空格分割取第二列,即保存坐标的列
my $next_line_split = (split(/\s+/,$next_line))[1];
#将该行添加到上一行末尾存入line变量
$line .= $next_line_split;
#提取再下一行继续while判断
$next_line = <EMBL>;
}
#去掉不满足while条件的行末尾的换行符
#其实这个地方有点小bug,如果没进入上面的循环的话next_line应该处于未声明状态,可能会导致warning
chomp $next_line;
#按空格分隔取该行第二列,即保存坐标的列
my $next_line_split = (split(/\s+/,$next_line))[1];
#将该行添加到上一行末尾存入line变量
$line .= $next_line_split;
}
#按逗号分隔line并按组存入group中
my @group = split(",",$line);
#遍历group分别按顺序提取起始位点和终止位点存入两个数组中
foreach my $group (@group){
if($group =~ /(\d+)\.\.(\d+)/){
push(@start,$1);
push(@end,$2);
}
}
#返回两个数组
return(\@start,\@end);
}
写完之后感觉还是面向对象编程的能力不是很够,这个脚本还是特解,想快速写出能获得通解的脚本还是能力不够,继续锻炼吧。(如果有大佬路过请多指教,谢谢!)