从 NCBI 下载的数据不一定符合链特异性转录组分析所需的格式,所以写个小脚本处理一下:
#!perl
use warnings;
use strict;
# At least three arguments are required: the mode, the output prefix and one
# input file. With fewer, print the usage text and stop.
if (@ARGV < 3) {
    die <<"USAGE";

perl $0 <type> <outprefix> <file1> [<file2> ...]
type: "gtf", "gene" and "genome".
Note: "genome" need gtf.list of "gtf", and gtf.list must be the end option of <file...>.
USAGE
}

# Peel the mode ($t) and the output prefix ($out) off the front of @ARGV;
# whatever remains in @ARGV is the list of input files.
my ($t, $out) = splice @ARGV, 0, 2;

# Converter script that the "gtf" mode runs through python.
my $py = "gbk2gtf.py";
if($t eq "gtf")
{
# "gtf" mode: feed every input file to the $py converter, merge the per-file
# results into "$out.gtf", and write the distinct values of the first GTF
# column to "gtf.list" (the usage text says the "genome" mode needs that file).
#
# Changes compared with the plain `system "..."` calls this replaces:
#   * every path is single-quoted for the shell, so names containing spaces
#     or shell metacharacters no longer break (or alter) the command;
#   * a failing command aborts the run instead of silently producing a
#     truncated "$out.gtf";
#   * only the temp files created here are removed, instead of `rm *.tmp -rf`,
#     which deleted unrelated *.tmp entries in the current directory and
#     missed temp files of inputs located in other directories.

# Wrap a word in single quotes for /bin/sh, escaping embedded single quotes.
my $shell_quote = sub {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return "'$word'";
};

my @gtf;    # temp files created so far, one per input

# Run a shell command; on failure remove our temp files and die.
my $run = sub {
    my ($cmd) = @_;
    return if system($cmd) == 0;
    my $status = $?;
    unlink @gtf;
    die "command failed (wait status $status): $cmd\n";
};

foreach my $input (@ARGV)
{
    my $tmp = "$input.tmp";
    # Register the temp file first so it is cleaned up even if the
    # converter fails half-way through writing it.
    push @gtf, $tmp;
    $run->(sprintf 'python %s < %s > %s',
        $shell_quote->($py), $shell_quote->($input), $shell_quote->($tmp));
}

my $merged = $shell_quote->("$out.gtf");
$run->(sprintf 'cat %s > %s', join(' ', map { $shell_quote->($_) } @gtf), $merged);
$run->("cut -f 1 $merged | sort | uniq > gtf.list");

# Remove exactly the temp files created above.
unlink @gtf;
}elsif($t eq "gene"){
# "gene" mode: copy every input FASTA file into "$out.gene.fa", replacing each
# header line (">...") by ">" followed by the value of its locus_tag= field
# and dropping empty lines.
#
# Records are streamed straight into the output file, so no "<input>.tmp"
# files are created and nothing has to be removed afterwards (the previous
# `rm *.tmp -rf` also deleted unrelated *.tmp entries in the current
# directory).
my $fasta_out = "$out.gene.fa";

# Opening the output truncates it, so refuse to run when the same path is
# also given as an input. This is a plain string comparison; it does not
# detect two different spellings of the same file.
die "output file $fasta_out is also listed as an input\n"
    if grep { $_ eq $fasta_out } @ARGV;

open my $out_fh, '>', $fasta_out or die "cannot write $fasta_out: $!\n";

foreach my $f (@ARGV)
{
    open my $in_fh, '<', $f or die "cannot read $f: $!\n";
    while (my $line = <$in_fh>)
    {
        next if $line =~ /^\n/;    # skip empty lines
        chomp $line;
        if ($line =~ /^>/)
        {
            # Use the captured tag only when the match succeeded. The old
            # code printed $1 unconditionally, which produced an empty ">"
            # header whenever locus_tag= was missing.
            if (my ($id) = $line =~ /locus_tag=(\w+)/)
            {
                print {$out_fh} ">$id\n";
            }
            else
            {
                warn "$f line $.: no locus_tag in header, keeping it unchanged: $line\n";
                print {$out_fh} "$line\n";
            }
        }
        else
        {
            print {$out_fh} "$line\n";
        }
    }
    close $in_fh;
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close $out_fh or die "cannot finish writing $fasta_out: $!\n";
}elsi