1 Biology and Computer Science
2 Getting Started with Perl
3 The Art of Programming
4 Sequences and Strings
5 Motifs and Loops
Example 53. Searching for motifs
#!/usr/bin/perl w
# Searching for motifs
# Ask the user for the filename of the file containing
# the protein sequence data, and collect it from the keyboard
print "Please type the filename of the protein sequence data: ";
$proteinfilename = <STDIN>;
# Remove the newline from the protein filename
chomp $proteinfilename;
# open the file, or exit
unless ( open(PROTEINFILE, $proteinfilename) ) {
print "Cannot open file \"$proteinfilename\"\n\n";
exit;
}
# Read the protein sequence data from the file, and store it
# into the array variable @protein
@protein = <PROTEINFILE>;
# Close the file we've read all the data into @protein now.
close PROTEINFILE;
# Put the protein sequence data into a single string, as it's easier
# to search for a motif in a string than in an array of
# lines (what if the motif occurs over a line break?)
$protein = join( '', @protein);
# Remove whitespace
$protein =~ s/\s//g;
# In a loop, ask the user for a motif, search for the motif,
# and report if it was found.
# Exit if no motif is entered.
do {
print "Enter a motif to search for: ";
$motif = <STDIN>;
# Remove the newline at the end of $motif
chomp $motif;
# Look for the motif
if ( $protein =~ /$motif/ ) {
print "I found it!\n\n";
} else {
print "I couldn\'t find it.\n\n";
}
# exit on an empty user input
} until ( $motif =~ /^\s*$/ );
# exit the program
exit;
#!/usr/bin/perl w
# Determining frequency of nucleotides, take 2
# Get the DNA sequence data
print "Please type the filename of the DNA sequence data: ";
$dna_filename = <STDIN>;
chomp $dna_filename;
# Does the file exist?
unless ( e $dna_filename) {
print "File \"$dna_filename\" doesn\'t seem to exist!!\n";
exit;
}
# Can we open the file?
unless ( open(DNAFILE, $dna_filename) ) {
print "Cannot open file \"$dna_filename\"\n\n";
exit;
}
@DNA = <DNAFILE>;
close DNAFILE;
$DNA = join( '', @DNA);
# Remove whitespace
$DNA =~ s/\s//g;
# Initialize the counts.
# Notice that we can use scalar variables to hold numbers.
$count_of_A = 0;
$count_of_C = 0;
$count_of_G = 0;
$count_of_T = 0;
$errors = 0;
# In a loop, look at each base in turn, determine which of the
# four types of nucleotides it is, and increment the
# appropriate count.
for ( $position = 0 ; $position < length $DNA ; ++$position ) {
$base = substr($DNA, $position, 1);
if ( $base eq 'A' ) {
++$count_of_A;
} elsif ( $base eq 'C' ) {
++$count_of_C;
} elsif ( $base eq 'G' ) {
++$count_of_G;
} elsif ( $base eq 'T' ) {
++$count_of_T;
} else {
print "!!!!!!!! Error I don\'t recognize this base:
$base\n";
++$errors;
}
}
# print the results
print "A = $count_of_A\n";
print "C = $count_of_C\n";
print "G = $count_of_G\n";
print "T = $count_of_T\n";
print "errors = $errors\n";
# exit the program
exit;
Here's the output of Example 56:
Please type the filename of the DNA sequence data: small.dna
!!!!!!!! Error I don't recognize this vase: V
A = 40
C = 27
G = 24
T = 17
errors = 1
6 Subroutines and Bugs
7 Mutations and Randomization
8 The Genetic Code
9 Restriction Maps and Regular Expressions
10 GenBank
Extract annotation and sequence from GenBank file
#!/usr/bin/perl
# Extract annotation and sequence from GenBank file
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# declare and initialize variables
my @annotation = ( );
my $sequence = '';
my $filename = 'record.gb';
parse1(\@annotation, \$sequence, $filename);
# Print the annotation, and then
# print the DNA in new format just to check if we got it okay.
print @annotation;
print_sequence($sequence, 50);
exit;
#######################
sub parse1 {
my($annotation, $dna, $filename) = @_;
# $annotationreference to array
# $dna
reference to scalar
# $filename scalar
# declare and initialize variables
my $in_sequence = 0;
my @GenBankFile = ( );
# Get the GenBank data into an array from a file
@GenBankFile = get_file_data($filename);
# Extract all the sequence lines
foreach my $line (@GenBankFile) {
if( $line =~ /^\/\/\n/ ) { # If $line is endofrecord line
//\n,
last; #break out of the foreach loop.
} elsif( $in_sequence) { # If we know we're in a sequence,
$$dna .= $line; # add the current line to $$dna.
} elsif ( $line =~ /^ORIGIN/ ) { # If $line begins a sequence,
$in_sequence = 1; # set the $in_sequence flag.
} else{ # Otherwise
push( @$annotation, $line); # add the current line to
@annotation.
}
}
# remove whitespace and line numbers from DNA sequence
$$dna =~ s/[\s09]//g;
}
Extract annotation and sequence from Genbank record
#!/usr/bin/perl
# Extract the annotation and sequence sections from the first
# record of a GenBank library
use strict;
use warnings;
use BeginPerlBioinfo;
# Declare and initialize variables
my $annotation = '';
my $dna = '';
my $record = '';
my $filename = 'record.gb';
my $save_input_separator = $/;
# Open GenBank library file
unless (open(GBFILE, $filename)) {
print "Cannot open GenBank file \"$filename\"\n\n";
exit;
}
# Set input separator to "//\n" and read in a record to a scalar
$/ = "//\n";
$record = <GBFILE>;
# reset input separator
$/ = $save_input_separator;
# Now separate the annotation from the sequence data
($annotation, $dna) = ($record =~ /^(LOCUS.*ORIGIN\s*\n)(.*)\/\/\n/s);
# Print the two pieces, which should give us the same as the
# original GenBank file, minus the // at the end
print $annotation, $dna;
exit;
Parsing GenBank annotations using arrays
#!/usr/bin/perl
# Parsing GenBank annotations using arrays
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my @genbank = ( );
my $locus = '';
my $accession = '';
my $organism = '';
# Get GenBank file data
@genbank = get_file_data('record.gb');
# Let's start with something simple. Let's get some of the
identifying
# information, let's say the locus and accession number (here the
same
# thing) and the definition and the organism.
for my $line (@genbank) {
if($line =~ /^LOCUS/) {
$line =~ s/^LOCUS\s*//;
$locus = $line;
}elsif($line =~ /^ACCESSION/) {
$line =~ s/^ACCESSION\s*//;
$accession = $line;
}elsif($line =~ /^ ORGANISM/) {
$line =~ s/^\s*ORGANISM\s*//;
$organism = $line;
}
}
print "*** LOCUS ***\n";
print $locus;
print "*** ACCESSION ***\n";
print $accession;
print "*** ORGANISM ***\n";
print $organism;
exit;
Listing the contents of a folder (or directory)
#!/usr/bin/perl
# Demonstrating how to open a folder and list its contents
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
my @files = ( );
my $folder = 'pdb';
# open the folder
unless(opendir(FOLDER, $folder)) {
print "Cannot open folder $folder!\n";
exit;
}
# read the contents of the folder (i.e. the files and subfolders)
@files = readdir(FOLDER);
# close the folder
closedir(FOLDER);
# print them out, one per line
print join( "\n", @files), "\n";
exit;
Extract sequence chains from PDB file
#!/usr/bin/perl
# Extract sequence chains from PDB file
use strict;
use warnings;
use BeginPerlBioinfo;
# Read in PDB file: Warning some files are very large!
my @file = get_file_data('pdb/c1/pdb1c1f.ent');
# Parse the record types of the PDB file
my %recordtypes = parsePDBrecordtypes(@file);
# Extract the amino acid sequences of all chains in the protein
my @chains = extractSEQRES( $recordtypes{'SEQRES'} );
# Translate the 3character codes to 1character codes, and print
foreach my $chain (@chains) {
print "****chain $chain **** \n";
print "$chain\n";
print iub3to1($chain), "\n";
}
exit;
# parsePDBrecordtypes
#
#given an array of a PDB file, return a hash with
# keys = record type names
# values = scalar containing lines for that record type
sub parsePDBrecordtypes {
my @file = @_;
use strict;
use warnings;
my %recordtypes = ( );
foreach my $line (@file) {
# Get the record type name which begins at the
# start of the line and ends at the first space
my($recordtype) = ($line =~ /^(\S+)/);
# .= fails if a key is undefined, so we have to
# test for definition and use either .= or = depending
if(defined $recordtypes{$recordtype} ) {
$recordtypes{$recordtype} .= $line;
}else{
$recordtypes{$recordtype} = $line;
}
}
return %recordtypes;
}
# extractSEQRES
#
#given an scalar containing SEQRES lines,
# return an array containing the chains of the sequence
sub extractSEQRES {
use strict;
use warnings;
my($seqres) = @_;
my $lastchain = '';
my $sequence = '';
my @results = ( );
# make array of lines
my @record = split ( /\n/, $seqres);
foreach my $line (@record) {
# Chain is in column 12, residues start in column 20
my ($thischain) = substr($line, 11, 1);
my($residues) = substr($line, 19, 52); # add space at end
# Check if a new chain, or continuation of previous chain
if("$lastchain" eq "") {
$sequence = $residues;
}elsif("$thischain" eq "$lastchain") {
$sequence .= $residues;
# Finish gathering previous chain (unless first record)
}elsif ( $sequence ) {
push(@results, $sequence);
$sequence = $residues;
}
$lastchain = $thischain;
}
# save last chain
push(@results, $sequence);
return @results;
}
# iub3to1
#
#change string of 3character IUB amino acid codes (whitespace
separated)
# into a string of 1character amino acid codes
sub iub3to1 {
my($input) = @_;
my %three2one = (
'ALA' => 'A',
'VAL' => 'V',
'LEU' => 'L',
'ILE' => 'I',
'PRO' => 'P',
'TRP' => 'W',
'PHE' => 'F',
'MET' => 'M',
'GLY' => 'G',
'SER' => 'S',
'THR' => 'T',
'TYR' => 'Y',
'CYS' => 'C',
'ASN' => 'N',
'GLN' => 'Q',
'LYS' => 'K',
'ARG' => 'R',
'HIS' => 'H',
'ASP' => 'D',
'GLU' => 'E',
);
# clean up the input
$input =~ s/\n/ /g;
my $seq = '';
# This use of split separates on any contiguous whitespace
my @code3 = split(' ', $input);
foreach my $code (@code3) {
# A little error checking
if(not defined $three2one{$code}) {
print "Code $code not defined\n";
next;
}
$seq .= $three2one{$code};
}
return $seq;
}
12 BLAST
Extract annotation and alignments from BLAST output file
#!/usr/bin/perl
# Extract annotation and alignments from BLAST output file
use strict;
use warnings;
use BeginPerlBioinfo;
# declare and initialize variables
my $beginning_annotation = '';
my $ending_annotation = '';
my %alignments = ( );
my $filename = 'blast.txt';
parse_blast(\$beginning_annotation, \$ending_annotation, \%alignments,
$filename);
# Print the annotation, and then
# print the DNA in new format just to check if we got it okay.
print $beginning_annotation;
foreach my $key (keys %alignments) {
print "$key\nXXXXXXXXXXXX\n", $alignments{$key},
"\nXXXXXXXXXXX\n";
}
print $ending_annotation;
exit;
# parse_blast
#
# parse beginning and ending annotation, and alignments,
# from BLAST output file
sub parse_blast {
my($beginning_annotation, $ending_annotation, $alignments,
$filename) = @_;
# $beginning_annotationreference to scalar
# $ending_annotation reference to scalar
# $alignments
reference to hash
# $filename
scalar
# declare and initialize variables
my $blast_output_file = '';
my $alignment_section = '';
# Get the BLAST program output into an array from a file
$blast_output_file = join( '', get_file_data($filename));
# Extract the beginning annotation, alignments, and ending
annotation
($$beginning_annotation, $alignment_section, $$ending_annotation)
= ($blast_output_file =~ /(.*^ALIGNMENTS\n)(.*)(^
Database:.*)/ms);
# Populate %alignments hash
# key = ID of hit
# value = alignment section
%$alignments = parse_blast_alignment($alignment_section);
}
# parse_blast_alignment
#
# parse the alignments from a BLAST output file,
# return hash with
# key = ID
# value = text of alignment
sub parse_blast_alignment {
my($alignment_section) = @_;
# declare and initialize variables
my(%alignment_hash) = ( );
# loop through the scalar containing the BLAST alignments,
# extracting the ID and the alignment and storing in a hash
#
# The regular expression matches a line beginning with >,
# and containing the ID between the first pair of | characters;
# followed by any number of lines that don't begin with >
while($alignment_section =~ /^>.*\n(^(?!>).*\n)+/gm) {
my($value) = $&;
my($key) = (split(/\|/, $value)) [1];
$alignment_hash{$key} = $value;
}
return %alignment_hash;
}
13 Further Topics