reads_classifier.pl
1 #!/usr/bin/perl 2 #=============================================================================== 3 # 4 # FILE: reads_classifier.pl 5 # 6 # USAGE: ./reads_classifier.pl 7 # 8 # DESCRIPTION: 9 # 10 # OPTIONS: --- 11 # REQUIREMENTS: --- 12 # BUGS: --- 13 # NOTES: --- 14 # AUTHOR: YAN Yun (Puriney), yanyun@whu.edu.cn 15 # COMPANY: 16 # VERSION: 1.0 17 # CREATED: 2012年07月17日 15时32分48秒 18 # REVISION: --- 19 #=============================================================================== 20 21 use strict; 22 use warnings; 23 24 my ($inbed, $index, $out) = @ARGV; 25 die <DATA> unless (@ARGV == 3); 26 open IN, "$inbed" || die $!; 27 open OUT , ">$out" || die $!; 28 while (<IN>){ 29 30 chomp; 31 print OUT "$_"; 32 my ($reads_ref, $reads_s, $reads_e, $reads_str) = (split /\t/,$_)[0,1,2,5]; 33 34 my $class = 0; 35 $class = &reads_type_classifer($reads_ref, $reads_s,$reads_e, $reads_str, $index); 36 print OUT "\t$class\n"; 37 } 38 39 40 sub reads_type_classifer{ 41 my ($reads_ref, $reads_s,$reads_e, $reads_str, $index) = @_; 42 #print "ini:$reads_s\t$reads_e\n"; 43 open INDEX, $index; 44 #index format (1-based cordination): 45 #accesion_num, CDS/tRNA, strand, rRNA/Gene, Joint/Single_positioin, position_start1, postion_end1, position_start2(if J), position_end2(if J) 46 47 #position hash, f=forward, r=reverse 48 my (%trnaf,%cdsf,%rrnaf,%trnar,%cdsr,%rrnar); 49 my ($ref, $cort, $str, $rorg, $js); 50 51 #my $classtag = 0; 52 #my $class = "unknow1"; 53 my $go = "Y"; 54 while (<INDEX>){ 55 56 chomp; 57 my @col = split (/\t/,$_); 58 59 #$cort: CDS or tRNA 60 #$rorg: ribosomal RNA or Gene 61 #$js : joint or single 62 ($ref, $cort, $str, $rorg, $js) = (split /\t/,$_)[0,1,2,3,4]; 63 64 if ($reads_ref =~ /$ref/){ 65 66 my ($s, $e) = (0,0); 67 68 #get forward information 69 if ($str ==1){ 70 #get (%trnaf,%cdsf,%rrnaf,) 71 if ($js eq "J"){ 72 if ($cort eq "tRNA"){ #tRNA 73 #parse joint position 74 for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) { 75 ($s, $e)=($col[$list], $col[$list+1]); 76 $trnaf{$s} = $e; 77 } 78 next; 79 } 80 elsif ( $cort eq "CDS"){ #CDS 81 #parse join position 82 for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) { 83 ($s, $e)=($col[$list], $col[$list+1]); 84 $cdsf{$s} = $e; 85 } 86 if ($rorg ne "G"){ #ribosomal RNA 87 #parse joint position 88 for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) { 89 ($s, $e)=($col[$list], $col[$list+1]); 90 $rrnaf{$s} = $e; 91 } 92 } 93 } 94 } 95 elsif ($js eq "S"){ 96 ($s, $e) = (split /\t/,$_)[5,6]; 97 98 #CORE CODES 99 #CLASSIFIER 100 if ($cort eq "tRNA"){ #tRNA 101 $trnaf{$s} = $e; 102 next; 103 } 104 elsif ( $cort eq "CDS"){ #CDS 105 $cdsf{$s} = $e; 106 if ($rorg ne "G"){ #ribosomal RNA 107 $rrnaf{$s} = $e; 108 next; 109 } 110 } 111 112 } 113 } 114 115 #get complementary strand information 116 elsif ($str == -1){ 117 #get %trnar,%cdsr,%rrnar 118 if ($js eq "J"){ 119 if ($cort eq "tRNA"){ #tRNA 120 #parse joint position 121 for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) { 122 ($s, $e)=($col[$list], $col[$list+1]); 123 $trnar{$s} = $e; 124 } 125 next; 126 } 127 elsif ( $cort eq "CDS"){ #CDS 128 #parse join position 129 for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) { 130 ($s, $e)=($col[$list], $col[$list+1]); 131 $cdsr{$s} = $e; 132 } 133 if ($rorg ne "G"){ #ribosomal RNA 134 #parse joint position 135 for ( my $list = 5; $list <= (@col - 1 - 1); $list=$list + 2 ) { 136 ($s, $e)=($col[$list], $col[$list+1]); 137 $rrnar{$s} = $e; 138 } 139 } 140 } 141 } 142 elsif ($js eq "S"){ 143 ($s, $e) = (split /\t/,$_)[5,6]; 144 145 #CORE CODES 146 #CLASSIFIER 147 if ($cort eq "tRNA"){ #tRNA 148 $trnar{$s} = $e; 149 next; 150 } 151 elsif ( $cort eq "CDS"){ #CDS 152 $cdsr{$s} = $e; 153 if ($rorg ne "G"){ #ribosomal RNA 154 $rrnar{$s} = $e; 155 next; 156 } 157 } 158 } 159 160 } 161 162 } 163 else { 164 $go = "N"; 165 print "no kidding me\n"; 166 last; 167 } 168 } 169 170 171 my $class = "initial"; 172 print "$go\n" if ($class eq "initial" and $go eq "N"); 173 my $classtag = 0; 174 if ($go ne "N"){ 175 # now we have (%trnaf,%cdsf,%rrnaf,%trnar,%cdsr,%rrnar); 176 # 177 # decide read's class, tRNA, rRNA, CDS, inter genenic region 178 # 179 #my $classtag = 0; 180 %trnaf = &sorthash(%trnaf); 181 %trnar = &sorthash(%trnar); 182 %cdsf = &sorthash(%cdsf); 183 %cdsr = &sorthash(%cdsr); 184 %trnaf = &sorthash(%trnaf); 185 %trnar = &sorthash(%trnar); 186 if ($reads_str eq "+"){ 187 188 #if tRNA start 189 if ( &withinorout($reads_s, $reads_e, %trnaf) != 0){ 190 #tRNA 191 $classtag = &withinorout($reads_s, $reads_e, %trnaf); 192 #print "tRNA:$reads_s\t$reads_e\n"; 193 if ($classtag == 1){ 194 $class = "tRNA"; 195 } 196 elsif ($classtag == -1) { 197 if (&withinorout($reads_s, $reads_e,%cdsf) == 1){ 198 $class = "CDS"; 199 } 200 else { 201 #$class = "unknow2"; 202 $class = "IGR+1"; 203 } 204 } 205 else { 206 $class = "unknown"; 207 } 208 } 209 #if trna end 210 211 elsif ( &withinorout($reads_s, $reads_e, %cdsf) != 0){ 212 #elsif cds start 213 #CDS 214 $classtag = &withinorout($reads_s, $reads_e, %cdsf); 215 216 if ($classtag ==1){ 217 $class = "CDS+"; 218 219 #rRNA 220 if (&withinorout($reads_s, $reads_e,%rrnaf) == 1){ 221 $class = (split /\s+/, $rorg)[0]; 222 $class = "rRNA-".$class; 223 } 224 elsif (&withinorout($reads_s, $reads_e,%rrnaf) == -1){ 225 $class = "unknow3"; 226 } 227 228 } 229 elsif ($classtag == -1 ){ 230 $class = "IGR+2"; 231 } 232 else { 233 $class = "unknow+4"; 234 } 235 } 236 #elsif cds end 237 238 else { 239 $class = "unkown+5"; 240 } 241 } 242 243 elsif ($reads_str eq "-"){ 244 245 #if tRNA start 246 if (&withinorout($reads_s, $reads_e, %trnar) != 0){ 247 #tRNA 248 $classtag = &withinorout($reads_s, $reads_e, %trnar); 249 if ($classtag == 1){ 250 $class = "tRNA"; 251 } 252 elsif ($classtag == -1) { 253 if (&withinorout($reads_s, $reads_e,%cdsr) == 1){ 254 $class = "CDS"; 255 } 256 else { 257 #$class = "unknow2"; 258 $class = "IGR-1"; 259 } 260 } 261 else { 262 $class = "unknown"; 263 } 264 } 265 #if tRNA end 266 267 268 #if cds start 269 elsif (&withinorout($reads_s, $reads_e, %cdsr) != 0){ 270 #print "cds:$reads_s\t$reads_e\n"; 271 #print &withinorout($reads_s, $reads_e, %cdsr) ,"\n"; 272 =head 273 foreach my $key (sort {$a <=> $b} keys %cdsr){ 274 print "$key\n"; 275 } 276 =cut 277 #CDS 278 $classtag = &withinorout($reads_s, $reads_e, %cdsr); 279 if ($classtag == 1){ 280 $class = "CDS-"; 281 282 #rRNA 283 if (&withinorout($reads_s, $reads_e,%rrnar) == 1){ 284 $class = (split /\s+/, $rorg)[0]; 285 $class = "rRNA-".$class; 286 } 287 elsif (&withinorout($reads_s, $reads_e,%rrnar) == -1){ 288 $class = "unknow3"; 289 } 290 291 } 292 elsif ($classtag == -1 ){ 293 $class = "IGR-2"; 294 } 295 else { 296 $class = "unknow4"; 297 } 298 } 299 #if cds end 300 301 302 else { 303 $class = "unkown-5"; 304 } 305 } 306 } 307 308 elsif ($go eq "N"){ 309 print "no kidding me\n"; 310 die ; 311 } 312 else { 313 $class = "unkown6" 314 } 315 316 $class; 317 } 318 319 320 sub withinorout{ 321 my $tag = 0; 322 my ($s, $e, %hash) = @_; 323 #s = reads start, e = reads end 324 # bed format reads, 0-based cordination 325 $s = $s + 1; 326 $e = $e + 1; 327 328 #rs = reference start, re = reference end 329 # reference genebank, 1-based cordination 330 my ($rs,$re); 331 my $num = (keys %hash); 332 333 for (my $step = 1;$step<=$num;$step++){ 334 335 ($rs, $re) = each %hash; 336 =head 337 if ($step = $num){ 338 $tag = -1 if ($re < $s); 339 last; 340 } 341 =cut 342 #last unless ($rs && $re); 343 # elsif ($step <$num){ 344 if ($rs<=$s and $e<=$re){ 345 #print "$rs\t$s\t$e\t$re\n"; 346 #within 347 348 $tag = 1; 349 last; 350 351 } 352 else { 353 354 #without 355 356 $tag = -1; 357 my $tmp_e = $re; 358 ; 359 } 360 # } 361 # else { 362 # next; 363 # } 364 365 } 366 $tag; 367 } 368 369 sub sorthash{ 370 my %hash = @_; 371 my %sorthash; 372 foreach my $key (sort {$a<=>$b} keys %hash){ 373 $sorthash{$key} = $hash{$key}; 374 } 375 %sorthash; 376 } 377 378 379 __DATA__ 380 $0 <in.bed> <in.homemade.index> <out.plus.bed>
小题大做什么的