# Usage: perl gather_family_protein_gene.pl -a name_of_animal -e ../Extract_result -n GATHER
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
# Command-line options. Every value option is declared with ":s", so a flag
# may be given without a value (its variable then holds an empty string).
my ($animalName, $extractDir, $saveName, $help, $errorLog);
GetOptions(
    'help|h' => \$help,          # -h / --help : print the usage text
    'a:s'    => \$animalName,    # -a : file listing the animal species names
    'e:s'    => \$extractDir,    # -e : directory holding the extract results
    'n:s'    => \$saveName,      # -n : name of the gathered result file
    'r:s'    => \$errorLog,      # -r : error log file name
) or do {
    # GetOptions returns false on an unknown or malformed option and has
    # already warned about it; stop here instead of running with a
    # half-parsed command line.
    usage();
    exit 1;
};
# Print the command-line help text to the currently selected output handle
# (STDOUT unless the caller has select()ed another handle).
# Takes no arguments. $0 is interpolated, so the text shows the name the
# script was invoked with. Returns whatever print returns.
sub usage {
    print <<"USAGE";
usage:
#version: perl $0 [options]
#author: Oshyn Song <dualyangsong\@gmail.com>
#history: 2013-12-17
#desc: Gather the extract result to a file by species
options:
-h --help:print the info
-a :input the animal species filename
-e :input the extract result directory
-n :the save result filename
-r :the error log file name
#perl $0
USAGE
}
# Validate the command line BEFORE touching the filesystem, so that asking
# for help (or forgetting a required option) does not create or append to
# the error log.
if (defined $help) {
    usage();
    exit 0;
}
unless (defined $animalName && defined $extractDir && defined $saveName) {
    # A required option is missing: show the help text, but report failure
    # to the caller through a non-zero exit status.
    usage();
    exit 1;
}

# Redirect STDERR to the error log (append mode), so every later warning and
# die() message ends up in that file. "-r" without a value yields an empty
# string, which is treated the same as "not given".
$errorLog = "errorlog" unless defined $errorLog && length $errorLog;
open(STDERR, '>>', $errorLog)
    or die "Can not open errorlog $!";
print "Start process...\n";

# Open the file that lists the animal species, one name per line.
open(my $animal_fh, '<', $animalName)
    or die "Can not open file of ${animalName} $!";
print "Open the file of animal species name successfully.\n\n";

# For every species: read all result files in $extractDir/<species>/ and
# append one line per protein to $extractDir/<species>/$saveName.
while (my $entry = <$animal_fh>) {
    chomp $entry;

    # Keep everything before the first dot. A name without a dot is kept
    # whole (substr/index would silently chop its last character, because
    # index returns -1 when there is no dot).
    (my $animal_name = $entry) =~ s/\..*\z//s;
    next unless length $animal_name;    # blank line in the species list

    my $species_dir = "${extractDir}/${animal_name}";
    opendir(my $species_dh, $species_dir)
        or die "Can not open directory of ${species_dir}. $!";
    print "process ${animal_name}...\n";

    # Open the result file (append mode, as before).
    my $save_path = "${species_dir}/${saveName}";
    open(my $out_fh, '>>', $save_path)
        or die "Can not open ${save_path}. $!";

    my %hits_for;    # protein ID => "family=>evalue" entries joined by "\t|\t"
    my %gene_for;    # protein ID => gene ID (first mapping seen wins)

    # Sorted so the order of the entries within an output line does not
    # depend on the order in which the filesystem returns names.
    for my $filename (sort readdir $species_dh) {
        next if $filename =~ /^\./;
        # The result file lives in the directory being scanned; never read
        # it back as input.
        next if $filename eq $saveName;

        my $filepath = "${species_dir}/${filename}";
        next unless -f $filepath and -r $filepath;

        open(my $in_fh, '<', $filepath)
            or die "Can not open the file : ${filepath} $!";

        # Family name = file name up to its first dot (whole name if none).
        (my $tfname = $filename) =~ s/\..*\z//s;

        while (my $line = <$in_fh>) {
            chomp $line;
            if ($line =~ /^(ENS[\w]+?[\d]{11})[\t]([0-9e\-\.]+)$/) {
                # "<protein>\t<evalue>" line: record a hit for this family.
                my ($protein, $evalue) = ($1, $2);
                my $hit = "${tfname}=>${evalue}";
                $hits_for{$protein} = exists $hits_for{$protein}
                    ? "$hits_for{$protein}\t|\t${hit}"
                    : $hit;
            }
            elsif ($line =~ /^(ENS[\w]+?[\d]{11})[\t](ENS[\w]+?[\d]{11})/) {
                # "<protein>\t<gene>" line. The mapping is kept separately
                # and merged at output time, so it is honoured even when it
                # is read before the protein's first hit.
                my ($protein, $gene) = ($1, $2);
                $gene_for{$protein} = $gene unless exists $gene_for{$protein};
            }
        }
        close $in_fh;
    }
    closedir $species_dh;

    # One line per protein that has at least one hit:
    #   <protein>=>[<gene>\t]<family>=><evalue>[\t|\t<family>=><evalue>...]
    for my $protein (sort keys %hits_for) {
        my $value = exists $gene_for{$protein}
            ? "$gene_for{$protein}\t$hits_for{$protein}"
            : $hits_for{$protein};
        print {$out_fh} "${protein}=>${value}\n";
    }
    # Buffered write errors only surface at close, so check it.
    close $out_fh
        or die "Can not close ${save_path}. $!";
    print "${animal_name} process finished!\n\n";
}
close $animal_fh;
# The result after processing is as follows: