这还是那个妹子写的,博主提供了微量的帮忙和测试,觉得很不错,自己又懒得写,分享下:
#!perl -w
use strict;
die "Usage : perl $0 <in.snp> <out.file> [gff]" unless (@ARGV >=2);
my ($in, $out, $gff) = @ARGV;
$gff ||= 'refGene.ann.gz';
open IN, $in || die $!;
my ($tmp, $line, $rs, $chr, $site , $start, $ori, $num, $end, @samps, $ref, $alt, %infos, $dep, @tmps);
my %refs = ('M' => 'AC', 'R' => 'AG', 'W' => 'AT', 'S' => 'CG', 'Y' => 'CT', 'K' => 'GT');
while(<IN>)
{
chomp;
($chr, $start, $end) = (split)[1,2,2];
$infos{$chr}{$start}{'end'} = $end;
@{$infos{$chr}{$start}{'ann'}} = ();
$infos{$chr}{$start}{'gene'} = '-';
}
close IN;
open GFF, "gzip -dc $gff |" || die $!;
my ($up, $down);
my ($name, $id, %ids, $info, $region);
while(<GFF>)
{
chomp;
($chr, $region, $start, $end, $num, $ori, $info) = (split /\t/, $_)[0,2,3,4,5,6,8];
if($region =~ /intergenic/){
for $site (sort {$a <=> $b} keys %{$infos{$chr}}){
next unless ($site >= $start and $infos{$chr}{$site}{'end'} <= $end and $infos{$chr}{$site}{'gene'} eq '-');
$up = $site - $start;
$down = $end - $site;
my ($g1, $g2) = $info =~ /name=([\.\-\w]+);.+name=([\.\-\w]+);/;
push @{$infos{$chr}{$site}{'ann'}}, "$g1(dist=$up),$g2(dist=$down):$region";
}
}elsif($region =~ /RNA/)
{
next unless ($info =~ /ID=([-\w]+); name=([-\.\w]+);/);
($id, $name) = ($1, $2);
$ids{$id} = $name;
}else
{
next unless ($info =~ /Parent=([-\w]+);/);
$id = $1;
$name = $ids{$id};
for $site (sort {$a <=> $b} keys %{$infos{$chr}})
{
next unless ($site >= $start and $infos{$chr}{$site}{'end'} <= $end);
my $tmp = $region;
if ($region !~ /UTR/){
$tmp .= $num;
}
push @{$infos{$chr}{$site}{'ann'}}, "$name($id):$tmp";
if($infos{$chr}{$site}{'gene'} eq '-' or $infos{$chr}{$site}{'gene'} =~ /$name/){
$infos{$chr}{$site}{'gene'} = $name;
}else{
$infos{$chr}{$site}{'gene'} .= ",$name";
}
}
}
}
close GFF;
if($out =~ /\.gz$/){open OUT, "| gzip > $out" || die $!;}
else{open OUT, "> $out" || die $!;}
print OUT "#Chromosome\tStart\tEnd\tGene\tRegion\n";
for $chr (sort keys %infos)
{
for $site (sort {$a <=> $b} keys %{$infos{$chr}})
{
print OUT "$chr\t$site\t$infos{$chr}{$site}{'end'}\t$infos{$chr}{$site}{'gene'}\t";
my $i = 1;
for my $tmp ( @{$infos{$chr}{$site}{'ann'}}){
if ($i == 1){print OUT "$tmp";}
else {print OUT ";$tmp";}
$i ++;
}
print OUT "\n";
}
}
close OUT;
承接之前的加强版的gff格式。