#!/usr/bin/perl
use strict;
use warnings;
=c---------------------------------------
this perl script is edited to compute tajima's D
the origin file path is :
/ifs5/ST_COMG/USER/yanzengli/tajima/data/ChrB01/ChrB01.bb.snp
/ifs5/ST_COMG/USER/yanzengli/tajima/data/ChrB01/ChrB01.bb.tajima
/ifs5/ST_COMG/USER/yanzengli/tajima/data/filter.site.position/ChrB01.filter.comm.gz
edited by bangemantou (yanzengli@genomics.org.cn) 2012-03-29 pm;
=cut
die "\nUsage: compute tajima's D;\ncomands:\nperl $0 snp.data tajima.outfile filter.site.data;\n\n" if (@ARGV != 3);
my $snpdb = shift;
my $tajima_outfile = shift;
my $filter_position = shift;
open AA, $snpdb or die $!;
open BB, ">", $tajima_outfile or die $!;
open (CC, "gzip -dc $filter_position|" ) or die $!;
my $window = 50000;
my $bin = 0.1;
my $cout = int ( 1 / $bin );
my $step = $window * $bin;
my $sample = 0;
my %filter = ();
my %pi = ();
my %snp = ();
while ( <CC> ) {
my @tt = split /\s+/;
my $k = int ( $tt[1] / $step );
$filter{$k} ++;
}
close CC;
my $chr_name;
my $chrlen = 0;
while ( <AA> ) {
chomp;
my @tt = split;
my @line = @tt;
$chr_name = $line[0];
$chrlen = $tt[1];
$sample = ($tt[6]+$tt[7]) / 2;
my $k = int ( $tt[1] / $step );
$pi{$k} += pi ( $line[6], $line[7] );
if ( ($line[6]*$line[7]) != 0 ) { $snp{$k}++; }
}
close AA;
print BB " chrname\tposition\tsum.pi/filter.sites\ttajima's D\tsum.snp\tfilter.site\n";
my $window_num = int ( $chrlen / $step ); #the number of steps
for ( my $i=0; $i<=($window_num-$cout); $i++ ) {
my $sum_pi = 0;
my $sum_snp = 0;
my $filter_site = 0;
my $end = $i + $cout - 1;
for ( my $aa=$i; $aa<=$end; $aa++ ){
if ( !defined($pi{$aa}) ) { $pi{$aa} = 0; }
if ( !defined($snp{$aa}) ) { $snp{$aa} = 0; }
if ( !defined($filter{$aa}) ) { $filter{$aa} = 0; }
$sum_pi += $pi{$aa};
$sum_snp += $snp{$aa};
$filter_site += $filter{$aa};
}
my $d = tajima( $sample, $sum_pi, $sum_snp );
my $id = ($i + 1) * $step;
my $mean_pi = 0;
my $theta = 0;
if ( $filter_site ) { $mean_pi = $sum_pi / $filter_site; }
print BB "$chr_name\t$id\t$mean_pi\t$d\t$sum_snp\t$filter_site\n";
}
close BB;
sub pi {
my $a = $_[0];
my $b = $_[1];
if ( $a == 0 || $b == 0 ) { return 0; }
my $pi = ( 2 * $a * $b ) / ( ($a+$b) * ($a+$b-1) );
return $pi;
}
sub tajima {
my $n = $_[0];
my $pi = $_[1];
my $t = $_[2];
my $a1;
my $a2;
for ( my $i=1; $i<$n; $i++ ){
$a1 += (1 / $i);
$a2 += (1 / ($i*$i));
}
my $b1 = ($n + 1) / ( 3 * ($n-1) );
my $b2 = 2 * ($n*$n + $n + 3) / (9 * $n * ($n-1));
my $c1 = $b1 - (1 / $a1);
my $c2 = $b2 - ($n + 2) / ($a1 * $n) + $a2 / ($a1*$a1);
my $e1 = $c1 / $a1;
my $e2 = $c2 / ($a1*$a1 + $a2);
if ( $t == 0) { return "NA"; next; }
my $d = ( $pi - ($t/$a1) ) / sqrt ( $e1 * $t + $e2 * $t * ($t-1) );
return $d;
}
说明:
1、 snp.data format
chr_name position reference_base ancestral_base major minor major_# minor_# HWE genotypes
ChrB01 945 A A G A 6 30 0.166728 GG/0.000003 GG/0.000198 AG/1.000000 GG/0.000000
ChrB01 946 G G T G 6 30 0.166734 TT/0.000003 TT/0.000394 GT/1.000000 TT/0.000000
ChrB01 1660 G G A G 5 31 0.138889 AA/0.000000 AA/0.000000 AG/1.000000 AA/0.000000
this is the standard format of snp.
2、 filter.site.data
ChrB01 190
ChrB01 191
ChrB01 192
ChrB01 193
3、the algorithm is from tajima's paper.
[] Fumio Tajima, Statistical Method for Testing the Neutral Mutation Hypothesis by DNA Polymorphism