程序里面有很多小的知识点,大家要认真的看,才能发现:
a.fasta的DNA序列如下:
> sample dna | (This is a typical fasta header.)
agatggcggcgctgaggggtcttgggggctctaggccggccacctactgg
tttgcagcggagacgacgcatggggcctgcgcaataggagtacgctgcct
gggaggcgtgactagaagcggaagtagttgtgggcgcctttgcaaccgcc
tgggacgccgccgagtggtctgtgcaggttcgcgggtcgctggcgggggt
cgtgagggagtgcgccgggagcggagatatggagggagatggttcagacc
cagagcctccagatgccggggaggacagcaagtccgagaatggggagaat
gcgcccatctactgcatctgccgcaaaccggacatcaactgcttcatgat
cgggtgtgacaactgcaatgagtggttccatggggactgcatccggatca
ctgagaagatggccaaggccatccgggagtggtactgtcgggagtgcaga
gagaaagaccccaagctagagattcgctatcggcacaagaagtcacggga
gcgggatggcaatgagcgggacagcagtgagccccgggatgagggtggag
ggcgcaagaggcctgtccctgatccagacctgcagcgccgggcagggtca
gggacaggggttggggccatgcttgctcggggctctgcttcgccccacaa
atcctctccgcagcccttggtggccacacccagccagcatcaccagcagc
agcagcagcagatcaaacggtcagcccgcatgtgtggtgagtgtgaggca
tgtcggcgcactgaggactgtggtcactgtgatttctgtcgggacatgaa
gaagttcgggggccccaacaagatccggcagaagtgccggctgcgccagt
gccagctgcgggcccgggaatcgtacaagtacttcccttcctcgctctca
ccagtgacgccctcagagtccctgccaaggccccgccggccactgcccac
ccaacagcagccacagccatcacagaagttagggcgcatccgtgaagatg
agggggcagtggcgtcatcaacagtcaaggagcctcctgaggctacagcc
acacctgagccactctcagatgaggaccta
REBASE.txt的内容如下:
REBASE version 104
bionet.104
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
REBASE, The Restriction Enzyme Database http://rebase.neb.com
Copyright (c) Dr. Richard J. Roberts, 2001. All rights reserved.
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Rich Roberts Mar 30 2001
AaaI (XmaIII) C^GGCCG
AacI (BamHI) GGATCC
AaeI (BamHI) GGATCC
AagI (ClaI) AT^CGAT
AaqI (ApaLI) GTGCAC
AarI CACCTGCNNNN^
AarI ^NNNNNNNNGCAGGTG
AatI (StuI) AGG^CCT
AatII GACGT^C
AauI (Bsp1407I) T^GTACA
AbaI (BclI) T^GATCA
AbeI (BbvCI) CC^TCAGC
AbeI (BbvCI) GC^TGAGG
AbrI (XhoI) C^TCGAG
AcaI (AsuII) TTCGAA
AcaII (BamHI) GGATCC
AcaIII (MstI) TGCGCA
AcaIV (HaeIII) GGCC
AccI GT^MKAC
AccII (FnuDII) CG^CG
AccIII (BspMII) T^CCGGA
Acc16I (MstI) TGC^GCA
Acc36I (BspMI) ACCTGCNNNN^
Acc36I (BspMI) ^NNNNNNNNGCAGGT
Acc38I (EcoRII) CCWGG
Acc65I (KpnI) G^GTACC
Acc113I (ScaI) AGT^ACT
AccB1I (HgiCI) G^GYRCC
AccB2I (HaeII) RGCGC^Y
AccB7I (PflMI) CCANNNN^NTGG
AccBSI (BsrBI) CCG^CTC
AccBSI (BsrBI) GAG^CGG
AccEBI (BamHI) G^GATCC
AceI (TseI) G^CWGC
AceII (NheI) GCTAG^C
AceIII CAGCTCNNNNNNN^
AceIII ^NNNNNNNNNNNGAGCTG
AciI C^CGC
AciI G^CGG
AclI AA^CGTT
AclNI (SpeI) A^CTAGT
AclWI (BinI) GGATCNNNN^
这里面要求两次输入要读取的文件,第一次读取的是a.fasta。第二次读取的是REBASE.txt
sub IUB_to_regexp
{
# A subroutine that, given a sequence with IUB ambiguity codes,
# outputs a translation with IUB codes changed to regular expressions
# These are the IUB ambiguity codes
my($iub) = @_;
my $regular_expression = '';
my %iub2character_class =
(
# 这里除了四种常用的碱基外,用来表明核酸序列中不常见或不明确的碱基
A => 'A',
C => 'C',
G => 'G',
T => 'T',
R => '[GA]',#R可以代表G或者A中一种
Y => '[CT]',
M => '[AC]',
K => '[GT]',
S => '[GC]',
W => '[AT]',
B => '[CGT]',
D => '[AGT]',
H => '[ACT]',
V => '[ACG]',
N => '[ACGT]',
);
# Remove the ^ signs from the recognition sites
$iub =~ s/\^//g;
# Translate each character in the iub sequence
for ( my $i = 0 ; $i < length($iub) ; ++$i )
{
$regular_expression .= $iub2character_class{substr($iub, $i, 1)};
}
return $regular_expression;
}
sub get_file_data
{
# A subroutine to get data from a file given its filename
#读取文件的子序列
my $dna_filename;
my @filedata;
print "please input the Path just like this f:\\\\perl\\\\data.txt\n";
chomp($dna_filename=<STDIN>);
open(DNAFILENAME,$dna_filename)||die("can not open the file!");
@filedata = <DNAFILENAME>;
close DNAFILENAME;
return @filedata;#子函数的返回值一定要记住写
}
sub parseREBASE
{
my($rebasefile) = @_;
use strict;
use warnings;
my @rebasefile = ( );
my %rebase_hash = ( );
my $name;
my $site;
my $regexp;
# Read in the REBASE file
@rebasefile = get_file_data($rebasefile);
foreach ( @rebasefile )
{
# Discard header lines
( 1 .. /Rich Roberts/ ) and next;
# Discard blank lines
/^\s*$/ and next;
# Split the two (or three if includes parenthesized name) fields
my @fields = split( " ", $_);
# Get and store the name and the recognition site
# by not saving the middle field, if any,
# just the first and last
$name = $fields[0];
$site = $fields[-1];
# Translate the recognition sites to regular expressions
$regexp = IUB_to_regexp($site);
# Store the data into the hash
# $site 表示位点序列,$regexp 表示位点的可以和DNA序列匹配的位点序列
$rebase_hash{$name} = "$site $regexp";
}
# Return the hash containing the reformatted REBASE data
return %rebase_hash;
}
sub extract_sequence_from_fasta_data
{
#*******************************************************************
# A subroutine to extract FASTA sequence data from an array
# 得到其中的序列
# fasta格式介绍:
# 包括三个部分
# 1.第一行中以>开头的注释行,后面是名称和序列的来源
# 2.标准单字母符号的序列
# 3.*表示结尾
#*******************************************************************
my (@fasta_file_data) =@_;
my $sequence =' ';
foreach my $line (@fasta_file_data)
{
#这里忽略空白行
if ($line=~/^\s*$/)
{
next;
}
#忽略注释行
elsif($line=~/^\s*#/)
{
next;
}
#忽略fasta的第一行
elsif($line=~/^>/)
{
next;
}
else
{
$sequence .=$line;
}
}
$sequence=~s/\s//g;
return $sequence;
}
sub match_positions
{
my ($regexp,$sequence) = @_;
use strict;
my @positions =( );
while($sequence=~/$regexp/ig)
{
push (@positions,pos($sequence)-length($&)+1)
# pos返回最后一次匹配的位置
# $&代表匹配的位置,$`代表匹配位置之前的位置,$'代表匹配位置之后的位置
}
return @positions;
}
use strict;
use warnings;
my %rebase_hash = ( );
my @file_data = ( );
my $query = '';
my $dna = '';
my $recognition_site= '';
my $regexp = '';
my @locations = ( );
@file_data = get_file_data( );
$dna = extract_sequence_from_fasta_data(@file_data);
%rebase_hash = parseREBASE();
do
{
print "Please input restriction enzyme name\n";
chomp($query=<STDIN>);
if ($query=~/^\s*$/)
{
exit;
}
if ($rebase_hash{$query})
{
if ($rebase_hash{$query})
{
($recognition_site,$regexp) = split (" ",$rebase_hash{$query});
@locations = match_positions($regexp,$dna);
if (@locations)
{
print "searching for $query $recognition_site $regexp\n";
print "A restriction site for $query at locations:\n";
print join(" ",@locations),"\n";
}
else
{
print "A restriction site for $query is not in the DNA:\n";
}
}
print "\n";
}
}
until ($query=~/quit/);
最后的 结果如下:
F:\>perl\a.pl
please input the Path just like this f:\\perl\\data.txt
f:\\perl\\a.fasta
please input the Path just like this f:\\perl\\data.txt
f:\\perl\\REBASE.txt
Please input restriction enzyme name
AbeI
searching for AbeI GC^TGAGG GCTGAGG
A restriction site for AbeI at locations:
11
Please input restriction enzyme name