这个程序,用perl语言实现了RNA序列翻译蛋白质序列的过程。
考虑了AG GU alternative splicing influence and start coden, stop coden
统计了RNA序列可生产蛋白质长度
引用请注明出处
#!/usr/bin/perl -w # Program name: statProtein_splicing.pl # Author : sunchen # Contact : bbsunchen@gmail.com # Date : 10/22/2011 # Last Update : 10/27/2011 # Reference : Please cite our following papers when you are using this script. # Description : #=============================================================================================================== use warnings; use strict; use Getopt::Long; #my %opts; #GetOptions(\%opts,"dir:s"); #my $usage= <<"USAGE"; # Program: $0 # INPUT: # -dir full path of file # OUTPUT: #USAGE #die $usage unless ($opts{dir} && -e $opts{dir}); #open DIR, $opts{dir}; #create a hash table contain the coden and Aa my %Aa= ( "UUU"=>"F", "UUC"=>"F", "UUA"=>"L", "UUG"=>"L", "CUU"=>"L", "CUC"=>"L", "CUA"=>"L", "CUG"=>"L", "AUU"=>"I", "AUC"=>"I", "AUA"=>"I", "AUG"=>"START", "GUU"=>"V", "GUC"=>"V", "GUA"=>"V", "GUG"=>"V", "UCU"=>"S", "UCC"=>"S", "UCA"=>"S", "UCG"=>"S", "CCU"=>"P", "CCC"=>"P", "CCA"=>"P", "CCG"=>"P", "ACU"=>"T", "ACC"=>"T", "ACA"=>"T", "ACG"=>"T", "GCU"=>"A", "GCC"=>"A", "GCA"=>"A", "GCG"=>"A", "UAU"=>"Y", "UAC"=>"Y", "UAA"=>"END", "UAG"=>"END", "CAU"=>"H", "CAC"=>"H", "CAA"=>"Q", "CAG"=>"Q", "AAU"=>"N", "AAC"=>"N", "AAA"=>"K", "AAG"=>"K", "GAU"=>"D", "GAC"=>"D", "GAA"=>"E", "GAG"=>"E", "UGU"=>"C", "UGC"=>"C", "UGA"=>"END", "UGG"=>"W", "CGU"=>"R", "CGC"=>"R", "CGA"=>"R", "CGG"=>"R", "AGU"=>"S", "AGC"=>"S", "AGA"=>"R", "AGG"=>"R", "GGU"=>"G", "GGC"=>"G", "GGA"=>"G", "GGG"=>"G", ); open DIR, "<E:/lab/4_AUG"; open OUT, ">E:/lab/out.txt"; my $protein = 0; my $coden = 0; my @array; my $array_len = 0; my $if_exist = 0;#$if_exist my $seq = ""; my $start_point = 0; my $end_point = 0; my $order = 0; my @coden_size; sub detect_protein { if($if_exist == 1) { goto END_OF_SUB; } $coden = 0; my($string) = @_; @array=split "", $string; $array_len = @array; my $position = -1; $position = index($string,"AUG"); my $if_start = 0; my $to_print = ""; if($position == -1) { goto END_OF_SUB; }else { $to_print = $to_print."M"; $if_start = 1; $coden++; } my $pro = ""; for(my $i = $position+3; $i < $array_len-2; $i=$i+3) { $pro = $array[$i].$array[$i+1].$array[$i+2]; #print OUT $pro."\n"; #print OUT $Aa{$pro}."\n"; if($Aa{$pro} eq "START") { $coden++; $to_print = $to_print."M"; }elsif($Aa{$pro} eq "END") { $if_start = 2; $to_print = $to_print."\n"; $coden++; last; }else { $coden++; $to_print = $to_print.$Aa{$pro}; } } if($if_start == 1) { $to_print = $to_print."\n"; } if($if_start==2 && $if_exist == 0) { $protein++; $if_exist = 1; #print OUT $order."\n"; if($coden == 72) { print $to_print."\n"; } #print OUT $to_print; $coden_size[$coden]++; #print OUT $string."\n"; } END_OF_SUB: } while(<DIR>) { $order ++; $seq = $_; $if_exist = 0; my $splicing_site_ag = 0; $splicing_site_ag = index($seq,"AG"); my @ag_site=(); push(@ag_site,$splicing_site_ag); while($splicing_site_ag != -1) { #print "$splicing_site_ag\n"; $splicing_site_ag = index($seq,"AG",$splicing_site_ag+2); push(@ag_site,$splicing_site_ag); } my $splicing_site_gu = 0; $splicing_site_gu = rindex($seq,"GU"); my @gu_site=(); push(@gu_site,$splicing_site_gu); while($splicing_site_gu != -1) { #print "$splicing_site_gu\n"; $splicing_site_gu = rindex($seq,"GU",$splicing_site_gu-1); if($splicing_site_gu != -1 && defined($splicing_site_gu)) { push(@gu_site,$splicing_site_gu); }else { last; } } for(my $j = 0; $j < @ag_site;$j++) { if($ag_site[$j] > 270) { last; } if($if_exist == 1) { last; } for(my $k = @gu_site-1; $k >= 0;$k--) { #print JK "J:$ag_site[$j]##########K:$gu_site[$k]\n"; #print OUT "$ag_site[$j] < $gu_site[$k]\n"; if($ag_site[$j] < $gu_site[$k]) { if(($gu_site[$k] - $ag_site[$j] - 2) >= 30) { my $substr = substr($seq, $ag_site[$j]+2, $gu_site[$k] - $ag_site[$j] - 2); #print OUT2 "$substr\n"; my @sub_array = split "",$substr; my $final_length = @sub_array; detect_protein($substr); last; } } } } } for(my $i = 0; $i < @coden_size; $i++) { print "$i\n"; } print "###########################################\n"; for(my $i = 0; $i < @coden_size; $i++) { if(defined($coden_size[$i])) { print "$coden_size[$i]\n"; }else { print "0\n"; } } close DIR; close OUT;