Beginning Perl for Bioinformatics 总结提升

saghir

于 2015-06-15 00:01:50 发布

阅读量1.3k

点赞数

分类专栏： Perl

Perl 专栏收录该内容

5 篇文章 0 订阅

订阅专栏


1 Biology and Computer Science
2 Getting Started with Perl
3 The Art of Programming
4 Sequences and Strings


5 Motifs and Loops
Example 53. Searching for motifs 
#!/usr/bin/perl w 
# Searching for motifs 
# Ask the user for the filename of the file containing 
# the protein sequence data, and collect it from the keyboard 
print "Please type the filename of the protein sequence data: ";
$proteinfilename = <STDIN>; 
# Remove the newline from the protein filename 
chomp $proteinfilename; 
# open the file, or exit 
unless ( open(PROTEINFILE, $proteinfilename) ) { 
print "Cannot open file \"$proteinfilename\"\n\n"; 
exit; 
} 
# Read the protein sequence data from the file, and store it 
# into the array variable @protein 
@protein = <PROTEINFILE>; 
# Close the file  we've read all the data into @protein now. 
close PROTEINFILE; 
# Put the protein sequence data into a single string, as it's easier 
# to search for a motif in a string than in an array of 
# lines (what if the motif occurs over a line break?) 
$protein = join( '', @protein); 
# Remove whitespace 
$protein =~ s/\s//g; 
# In a loop, ask the user for a motif, search for the motif, 
# and report if it was found. 
# Exit if no motif is entered. 
do { 
print "Enter a motif to search for: "; 
$motif = <STDIN>; 
# Remove the newline at the end of $motif 
chomp $motif; 
# Look for the motif 
if ( $protein =~ /$motif/ ) { 
print "I found it!\n\n"; 
} else { 
print "I couldn\'t find it.\n\n"; 
}
# exit on an empty user input 
} until ( $motif =~ /^\s*$/ ); 
# exit the program 
exit; 

#!/usr/bin/perl w 
# Determining frequency of nucleotides, take 2 
# Get the DNA sequence data 
print "Please type the filename of the DNA sequence data: "; 
$dna_filename = <STDIN>; 
chomp $dna_filename; 
# Does the file exist? 
unless ( e $dna_filename) { 
print "File \"$dna_filename\" doesn\'t seem to exist!!\n"; 
exit; 
} 
# Can we open the file? 
unless ( open(DNAFILE, $dna_filename) ) { 
print "Cannot open file \"$dna_filename\"\n\n"; 
exit; 
} 
@DNA = <DNAFILE>; 
close DNAFILE; 
$DNA = join( '', @DNA); 
# Remove whitespace 
$DNA =~ s/\s//g; 
# Initialize the counts. 
# Notice that we can use scalar variables to hold numbers. 
$count_of_A = 0; 
$count_of_C = 0; 
$count_of_G = 0; 
$count_of_T = 0; 
$errors     = 0; 
# In a loop, look at each base in turn, determine which of the 
# four types of nucleotides it is, and increment the 
# appropriate count. 
for ( $position = 0 ; $position < length $DNA ; ++$position ) { 
$base = substr($DNA, $position, 1);
if     ( $base eq 'A' ) { 
++$count_of_A; 
} elsif ( $base eq 'C' ) { 
++$count_of_C; 
} elsif ( $base eq 'G' ) { 
++$count_of_G; 
} elsif ( $base eq 'T' ) { 
++$count_of_T; 
} else { 
print "!!!!!!!! Error  I don\'t recognize this base: 
$base\n"; 
++$errors; 
} 
} 
# print the results 
print "A = $count_of_A\n"; 
print "C = $count_of_C\n"; 
print "G = $count_of_G\n"; 
print "T = $count_of_T\n"; 
print "errors = $errors\n"; 
# exit the program 
exit; 
Here's the output of Example 56: 
Please type the filename of the DNA sequence data: small.dna 
!!!!!!!! Error  I don't recognize this vase: V 
A = 40 
C = 27 
G = 24 
T = 17 
errors = 1 



6 Subroutines and Bugs
7 Mutations and Randomization
8 The Genetic Code
9 Restriction Maps and Regular Expressions

10 GenBank
Extract annotation and sequence from GenBank file 
#!/usr/bin/perl 
# Extract annotation and sequence from GenBank file 
use strict; 
use warnings; 
use BeginPerlBioinfo;     # see Chapter 6 about this module 
# declare and initialize variables 
my @annotation = (  ); 
my $sequence = ''; 
my $filename = 'record.gb'; 
parse1(\@annotation, \$sequence, $filename); 
# Print the annotation, and then 
#   print the DNA in new format just to check if we got it okay. 
print @annotation; 
print_sequence($sequence, 50); 
exit; 
#######################
sub parse1 { 
my($annotation, $dna, $filename) = @_; 
# $annotationreference to array 
# $dna 
reference to scalar 
# $filename  scalar 
# declare and initialize variables 
my $in_sequence = 0; 
my @GenBankFile = (  ); 
# Get the GenBank data into an array from a file 
@GenBankFile = get_file_data($filename); 
# Extract all the sequence lines 
foreach my $line (@GenBankFile) { 
if( $line =~ /^\/\/\n/ ) { # If $line is endofrecord line 
//\n, 
last; #break out of the foreach loop. 
} elsif( $in_sequence) { # If we know we're in a sequence, 
$$dna .= $line; # add the current line to $$dna. 
} elsif ( $line =~ /^ORIGIN/ ) { # If $line begins a sequence, 
$in_sequence = 1; # set the $in_sequence flag. 
} else{ # Otherwise 
push( @$annotation, $line); # add the current line to 
@annotation. 
} 
} 
# remove whitespace and line numbers from DNA sequence 
$$dna =~ s/[\s09]//g; 
} 

Extract annotation and sequence from Genbank record 
#!/usr/bin/perl 
# Extract the annotation and sequence sections from the first 
#   record of a GenBank library 
use strict; 
use warnings; 
use BeginPerlBioinfo;
# Declare and initialize variables 
my $annotation = ''; 
my $dna = ''; 
my $record = ''; 
my $filename = 'record.gb'; 
my $save_input_separator = $/; 
# Open GenBank library file 
unless (open(GBFILE, $filename)) { 
print "Cannot open GenBank file \"$filename\"\n\n"; 
exit; 
} 
# Set input separator to "//\n" and read in a record to a scalar 
$/ = "//\n"; 
$record = <GBFILE>; 
# reset input separator 
$/ = $save_input_separator; 
# Now separate the annotation from the sequence data 
($annotation, $dna) = ($record =~ /^(LOCUS.*ORIGIN\s*\n)(.*)\/\/\n/s); 
# Print the two pieces, which should give us the same as the 
#  original GenBank file, minus the // at the end 
print $annotation, $dna; 
exit;

Parsing GenBank annotations using arrays 
#!/usr/bin/perl 
# Parsing GenBank annotations using arrays 
use strict; 
use warnings; 
use BeginPerlBioinfo;     # see Chapter 6 about this module 
# Declare and initialize variables 
my @genbank = (  ); 
my $locus = ''; 
my $accession = ''; 
my $organism = ''; 
# Get GenBank file data 
@genbank = get_file_data('record.gb');

# Let's start with something simple.  Let's get some of the 
identifying 
# information, let's say the locus and accession number (here the 
same 
# thing) and the definition and the organism. 
for my $line (@genbank) { 
if($line =~ /^LOCUS/) { 
$line =~ s/^LOCUS\s*//; 
$locus = $line; 
}elsif($line =~ /^ACCESSION/) { 
$line =~ s/^ACCESSION\s*//; 
$accession = $line; 
}elsif($line =~ /^  ORGANISM/) { 
$line =~ s/^\s*ORGANISM\s*//; 
$organism = $line; 
} 
} 
print "*** LOCUS ***\n"; 
print $locus; 
print "*** ACCESSION ***\n"; 
print $accession; 
print "*** ORGANISM ***\n"; 
print $organism; 
exit; 


Listing the contents of a folder (or directory) 
#!/usr/bin/perl 
#   Demonstrating how to open a folder and list its contents 
use strict; 
use warnings; 
use BeginPerlBioinfo;     # see Chapter 6 about this module 
my @files = (  ); 
my $folder = 'pdb'; 
# open the folder 
unless(opendir(FOLDER, $folder)) { 
print "Cannot open folder $folder!\n"; 
exit; 
} 
# read the contents of the folder (i.e. the files and subfolders) 
@files = readdir(FOLDER); 
# close the folder 
closedir(FOLDER); 
# print them out, one per line 
print join( "\n", @files), "\n"; 
exit; 


Extract sequence chains from PDB file 
#!/usr/bin/perl 
#  Extract sequence chains from PDB file 
use strict; 
use warnings; 
use BeginPerlBioinfo;

# Read in PDB file:  Warning  some files are very large! 
my @file = get_file_data('pdb/c1/pdb1c1f.ent'); 
# Parse the record types of the PDB file 
my %recordtypes = parsePDBrecordtypes(@file); 
# Extract the amino acid sequences of all chains in the protein 
my @chains = extractSEQRES( $recordtypes{'SEQRES'} ); 
# Translate the 3character codes to 1character codes, and print 
foreach my $chain (@chains) { 
print "****chain $chain **** \n";

print "$chain\n"; 
print iub3to1($chain), "\n"; 
} 
exit;


# parsePDBrecordtypes 
# 
#given an array of a PDB file, return a hash with 
#    keys   = record type names 
#    values = scalar containing lines for that record type 
sub parsePDBrecordtypes { 
my @file = @_; 
use strict; 
use warnings; 
my %recordtypes = (  ); 
foreach my $line (@file) { 
# Get the record type name which begins at the 
# start of the line and ends at the first space 
my($recordtype) = ($line =~ /^(\S+)/); 
# .= fails if a key is undefined, so we have to 
# test for definition and use either .= or = depending 
if(defined $recordtypes{$recordtype} ) { 
$recordtypes{$recordtype} .= $line; 
}else{ 
$recordtypes{$recordtype} = $line; 
} 
} 
return %recordtypes; 
} 
# extractSEQRES 
# 
#given an scalar containing SEQRES lines, 
#    return an array containing the chains of the sequence


sub extractSEQRES { 
use strict; 
use warnings; 
my($seqres) = @_; 
my $lastchain = ''; 
my $sequence = ''; 
my @results = (  ); 
# make array of lines 
my @record = split ( /\n/, $seqres); 
foreach my $line (@record) { 
# Chain is in column 12, residues start in column 20 
my ($thischain) = substr($line, 11, 1); 
my($residues)  = substr($line, 19, 52); # add space at end 
# Check if a new chain, or continuation of previous chain 
if("$lastchain" eq "") { 
$sequence = $residues; 
}elsif("$thischain" eq "$lastchain") { 
$sequence .= $residues; 
# Finish gathering previous chain (unless first record) 
}elsif ( $sequence ) { 
push(@results, $sequence); 
$sequence = $residues; 
} 
$lastchain = $thischain; 
} 
# save last chain 
push(@results, $sequence); 
return @results; 
} 
# iub3to1 
# 
#change string of 3character IUB amino acid codes (whitespace 
separated) 
#    into a string of 1character amino acid codes 
sub iub3to1 { 
my($input) = @_;

my %three2one = ( 
'ALA' => 'A', 
'VAL' => 'V', 
'LEU' => 'L', 
'ILE' => 'I', 
'PRO' => 'P', 
'TRP' => 'W', 
'PHE' => 'F', 
'MET' => 'M', 
'GLY' => 'G', 
'SER' => 'S', 
'THR' => 'T', 
'TYR' => 'Y', 
'CYS' => 'C', 
'ASN' => 'N', 
'GLN' => 'Q', 
'LYS' => 'K', 
'ARG' => 'R', 
'HIS' => 'H', 
'ASP' => 'D', 
'GLU' => 'E', 
); 
# clean up the input 
$input =~ s/\n/ /g; 
my $seq = ''; 
# This use of split separates on any contiguous whitespace 
my @code3 = split(' ', $input); 
foreach my $code (@code3) { 
# A little error checking 
if(not defined $three2one{$code}) { 
print "Code $code not defined\n"; 
next; 
} 
$seq .= $three2one{$code}; 
} 
return $seq; 
} 


12 BLAST
Extract annotation and alignments from BLAST output file 
#!/usr/bin/perl 
# Extract annotation and alignments from BLAST output file 
use strict; 
use warnings; 
use BeginPerlBioinfo;     
# declare and initialize variables
my $beginning_annotation = ''; 
my $ending_annotation = ''; 
my %alignments = (  ); 
my $filename = 'blast.txt'; 
parse_blast(\$beginning_annotation, \$ending_annotation, \%alignments, 
$filename); 
# Print the annotation, and then 
#   print the DNA in new format just to check if we got it okay. 
print $beginning_annotation; 
foreach my $key (keys %alignments) { 
print "$key\nXXXXXXXXXXXX\n", $alignments{$key}, 
"\nXXXXXXXXXXX\n"; 
} 
print $ending_annotation; 
exit; 

# parse_blast 
# 
# parse beginning and ending annotation, and alignments, 
#     from BLAST output file 
sub parse_blast { 
my($beginning_annotation, $ending_annotation, $alignments, 
$filename) = @_; 
# $beginning_annotationreference to scalar 
# $ending_annotation  reference to scalar 
# $alignments 
reference to hash 
# $filename 
scalar 
# declare and initialize variables 
my $blast_output_file = ''; 
my $alignment_section = ''; 
# Get the BLAST program output into an array from a file 
$blast_output_file = join( '', get_file_data($filename));

# Extract the beginning annotation, alignments, and ending 
annotation 
($$beginning_annotation, $alignment_section, $$ending_annotation) 
= ($blast_output_file =~ /(.*^ALIGNMENTS\n)(.*)(^ 
Database:.*)/ms); 
# Populate %alignments hash 
# key = ID of hit 
# value = alignment section 
%$alignments = parse_blast_alignment($alignment_section); 
} 
# parse_blast_alignment 
# 
# parse the alignments from a BLAST output file, 
#       return hash with 
#       key = ID 
#       value = text of alignment 
sub parse_blast_alignment { 
my($alignment_section) = @_; 
# declare and initialize variables 
my(%alignment_hash) = (  ); 
# loop through the scalar containing the BLAST alignments, 
# extracting the ID and the alignment and storing in a hash 
# 
# The regular expression matches a line beginning with >, 
# and containing the ID between the first pair of | characters; 
# followed by any number of lines that don't begin with > 
while($alignment_section =~ /^>.*\n(^(?!>).*\n)+/gm) { 
my($value) = $&; 
my($key) = (split(/\|/, $value)) [1]; 
$alignment_hash{$key} = $value; 
} 
return %alignment_hash; 
}


13 Further Topics