For explanation to linear space sequence alignment, please refer to http://ai.stanford.edu/~serafim/CS262_2007/notes/lecture3.pdf.
The algorithm and equation I used was from the textbook Algorithm Design by Jon Kleinberg and Éva Tardos(2005):
The algorithm of Backward-Space-Efficient-Alignment() is the reverse of Space-Efficient-Alignment().
Time cost of sequences from 10bp-510bp.
Memory cost of sequences from 10bp-510bp.
#include <iostream>
#include <fstream>
#include <cstring>
#include <cmath>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
using namespace std;
const int MATCH_COST=0;
const int MISMATCH_COST=2;
const int GAP_COST=1;
int memcnt=0; // counting memory allocated
struct BTNode
{
int x; // x axis
int y; // y axis
char type; // 'm'-match,'s'-mismatch,'g'-gap
char nt; // valid neucleotide: x-sequence x, y-sequence y, b-both
};
BTNode* P;
int pidx=0; // index of P
struct Region // region in a sequence
{
int start;
int end;
};
//
// Generate random sequence
//
char* GenRandSeq(char* seq, int n)
{
string nt="ATCG";
for(int i=0;i<n;++i)
{
int idx=rand()%4;
seq[i]=nt[idx];
}
return seq;
}
int min(int a,int b,int c)
{
return (a<b?a:b)<c?(a<b?a:b):c;
}
void Merge(BTNode* input, long p, long r)
{
long mid = floor((p + r) / 2);
long i1 = 0;
long i2 = p;
long i3 = mid + 1;
// Temp array
BTNode* temp=new BTNode[r-p+1];
memcnt+=sizeof(BTNode)*(r-p+1);
// Merge in sorted form the 2 arrays
while ( i2 <= mid && i3 <= r )
if ( input[i2].x < input[i3].x )
temp[i1++] = input[i2++];
else
temp[i1++] = input[i3++];
// Merge the remaining elements in left array
while ( i2 <= mid )
temp[i1++] = input[i2++];
// Merge the remaining elements in right array
while ( i3 <= r )
temp[i1++] = input[i3++];
// Move from temp array to master array
for ( int i = p; i <= r; i++ )
input[i] = temp[i-p];
delete [] temp;
}
// Merge sort the BTNode array by the first index(element x)
//
// inputs:
// p - the start index of array input
// r - the end index of array input
void Merge_sort(BTNode* input, long p, long r)
{
if ( p < r )
{
long mid = floor((p + r) / 2);
Merge_sort(input, p, mid);
Merge_sort(input, mid + 1, r);
Merge(input, p, r);
}
}
//
// Insert new node into P
//
void InsertNodetoP(int x, int y)
{
bool isok=true;
for(int i=0;i<pidx;++i)
{
if(x==P[i].x && y==P[i].y)
isok=false;
}
if(isok)
{
P[pidx].x=x;
P[pidx].y=y;
pidx++;
}
}
//
// Align seqences
//
// Recurrence:
// OPT(i,j)=min{
// alpha(i,j)+OPT(i-1,j-1),
// delta+OPT(i-1,j),
// delta+OPT(i,j-1)
// }
// where mismatch cost alpha=2, match cost alpha=0, and gap cost delta=1.
//
// Parameters:
// X,Y - sequences
// m - length of sequence X
// n - length of sequence Y
//
int Alignment(char* X, Region& rx, char* Y, Region& ry)
{
int m=rx.end-rx.start+1;
int n=ry.end-ry.start+1;
int** S=new int*[m+1]; // temp array to store scores of alignment
for(int i=0;i<m+1;++i)
S[i]=new int[n+1];
memcnt+=sizeof(int)*(m+1)*(n+1);
for(int i=0;i<=m;++i)
S[i][0]=i*GAP_COST;
for(int j=0;j<=n;++j)
S[0][j]=j*GAP_COST;
for(int j=1;j<=n;++j)
{
for(int i=1;i<=m;++i)
{
int alpha;
if(X[i-1]==Y[j-1])
alpha=MATCH_COST;
else
alpha=MISMATCH_COST;
S[i][j]=min(alpha+S[i-1][j-1],GAP_COST+S[i-1][j],GAP_COST+S[i][j-1]);
}
} // end of for
// Trace back
int ix=m;
int iy=n;
int bt_len=m+n;// length of array to record tracing back
int cnt=bt_len-1;
BTNode* bt=new BTNode[bt_len]; // record backtrace path
memcnt+=sizeof(BTNode)*bt_len;
while(ix>=1 && iy>=1)
{
bt[cnt].x=ix-1;
bt[cnt].y=iy-1;
InsertNodetoP(rx.start+ix-1,ry.start+iy-1);
if(S[ix][iy]==S[ix-1][iy-1]+MATCH_COST && X[ix-1]==Y[iy-1]) // match
{
bt[cnt].type='m';
bt[cnt].nt='b';
ix--;
iy--;
} else if(S[ix][iy]==S[ix-1][iy-1]+MISMATCH_COST) { // mismatch
bt[cnt].type='s';
bt[cnt].nt='b';
ix--;
iy--;
} else if(S[ix][iy]==S[ix-1][iy]+GAP_COST) { // gap
bt[cnt].type='g';
bt[cnt].nt='x';
ix--;
} else if(S[ix][iy]==S[ix][iy-1]+GAP_COST) { // gap
bt[cnt].type='g';
bt[cnt].nt='y';
iy--;
}
cnt--;
}
while(iy>0)
{
bt[cnt].x=ix-1;
bt[cnt].y=iy-1;
InsertNodetoP(rx.start+ix-1,ry.start+iy-1);
if(S[ix][iy]==S[ix][iy-1]+GAP_COST) { // gap
bt[cnt].type='g';
bt[cnt].nt='y';
iy--;
}
cnt--;
}
while(ix>0)
{
bt[cnt].x=ix-1;
bt[cnt].y=iy-1;
InsertNodetoP(rx.start+ix-1,ry.start+iy-1);
if(S[ix][iy]==S[ix-1][iy]+GAP_COST) { // gap
bt[cnt].type='g';
bt[cnt].nt='x';
ix--;
}
cnt--;
}
/* // Print the alignment
cnt++;
// TEST ONLY
for(int i=cnt;i<bt_len;++i)
{
cout<<"("<<bt[i].x<<","<<bt[i].y<<") ";
}
cout<<endl;
// Print sequence X
for(int i=cnt;i<bt_len;++i)
{
if(bt[i].nt=='x'||bt[i].nt=='b')
{
if(bt[i].x<0)
cout<<'-';
else
cout<<X[bt[i].x];
} else {
cout<<'-';
}
}
cout<<endl;
// print middle line
for(int i=cnt;i<bt_len;++i)
{
if(bt[i].type=='m')
cout<<'|';
else
cout<<' ';
}
cout<<endl;
// print sequence Y
for(int i=cnt;i<bt_len;++i)
{
if(bt[i].nt=='y'||bt[i].nt=='b')
{
if(bt[i].y<0)
cout<<'-';
else
cout<<Y[bt[i].y];
} else {
cout<<'-';
}
}
cout<<endl; */
delete [] bt;
int score=S[m][n];
for(int i=0;i<m+1;++i)
delete [] S[i];
delete [] S;
return score;
}
//
// Space efficient alignment, which caculate the length of the
// shortest path from (0,0) to (i,j) and only can returns the optimal value.
//
int SpaceEfficientAlignment(int** S, char* X, int m, char* Y, int n)
{
for(int i=0;i<=m;++i)
{
S[i][0]=i*GAP_COST;
}
for(int j=1;j<=n;++j)
{
S[0][1]=j*GAP_COST;
for(int i=1;i<=m;++i)
{
int alpha;
if(X[i-1]==Y[j-1])
alpha=MATCH_COST;
else
alpha=MISMATCH_COST;
S[i][1]=min(alpha+S[i-1][0],GAP_COST+S[i-1][1],GAP_COST+S[i][0]);
}
// move column 1 of S to column 0 to make room for next iteration
for(int i=0;i<=m;++i)
S[i][0]=S[i][1];
}
return S[m][1];
}
//
// Backward space efficient alignment, which caculate the length of the
// shortest path from (i,j) to (m,n) and only can returns the optimal value.
//
int BackwardSpaceEfficientAlignment(int** S, char* X, int m, char* Y, int n)
{
for(int i=m;i>=0;--i)
{
S[i][1]=(m-i)*GAP_COST;
}
for(int j=n-1;j>=0;--j)
{
S[m][0]=(n-j)*GAP_COST;
for(int i=m-1;i>=0;--i)
{
int alpha;
//if(X[m-i-1]==Y[j])
if(X[i]==Y[j])
alpha=MATCH_COST;
else
alpha=MISMATCH_COST;
S[i][0]=min(alpha+S[i+1][1],GAP_COST+S[i+1][0],GAP_COST+S[i][1]);
}
// move column 0 of S to column 1 to make room for next iteration
for(int i=0;i<=m;++i)
S[i][1]=S[i][0];
}
return S[0][0];
}
//
// Find out the q that minimize the score f(q,n/2)+g(q,n/2)
//
// Inputs:
// sf - score vector of forward space efficient alignment
// sb - score vector of backward space efficient alignment
// m - length of the score vector
//
int FindMinScore(int** sf,int** sb,int m)
{
int min=sf[1][1]+sb[1][0];
int ret=1;
for(int q=2;q<=m;++q)
{
if(min>sf[q][1]+sb[q][0])
{
min=sf[q][1]+sb[q][0];
ret=q-1; // ret should be the position in the sequence, so minus 1
}
}
return ret;
}
//
// Sequence alignemnt using divide and conquer
//
void DivideConquerAlignment(char* X, Region& rx, char* Y, Region& ry)
{
int m=rx.end-rx.start+1;
int n=ry.end-ry.start+1;
if(m<=2 || n<=2)
{
Alignment(&X[rx.start],rx,&Y[ry.start],ry);
// cout<<"Call Alignment"<<endl; // TEST ONLY
return;
}
int** score_forward=new int*[m+1]; // (m+1)x2 matrix to store scores of alignment
int** score_backward=new int*[m+1]; // (m+1)x2 matrix to store scores of alignment
for(int i=0;i<m+1;++i)
{
score_forward[i]=new int[2];
score_backward[i]=new int[2];
}
memcnt+=sizeof(int)*2*(m+1)*2;
// Get the middle points in path
Region tempry;
tempry.start=ry.start;
tempry.end=(ry.start+ry.end)/2;
SpaceEfficientAlignment(score_forward,&X[rx.start],m,&Y[tempry.start],tempry.end-tempry.start+1);
tempry.start=(ry.start+ry.end)/2+1;
tempry.end=ry.end;
BackwardSpaceEfficientAlignment(score_backward,&X[rx.start],m,&Y[tempry.start],tempry.end-tempry.start+1);
int q=FindMinScore(score_forward,score_backward,m)+rx.start;
// cout<<"q="<<q<<", n/2="<<(ry.start+ry.end)/2<<endl; // TEST ONLY
InsertNodetoP(q,(ry.start+ry.end)/2);
// Divide and Conquer
Region temprx;
temprx.start=rx.start;
temprx.end=q;
tempry.start=ry.start;
tempry.end=(ry.start+ry.end)/2;
if(temprx.start<=temprx.end && tempry.start<=tempry.end)
DivideConquerAlignment(X,temprx,Y,tempry);
temprx.start=q+1;
temprx.end=rx.end;
tempry.start=(ry.start+ry.end)/2+1;
tempry.end=ry.end;
if(temprx.start<=temprx.end && tempry.start<=tempry.end)
DivideConquerAlignment(X,temprx,Y,tempry);
for(int i=0;i<m+1;++i)
{
delete [] score_forward[i];
delete [] score_backward[i];
}
delete [] score_forward;
delete [] score_backward;
}
//
// Backtrace and print the alignment
//
void Backtrace(char* X,char* Y)
{
Merge_sort(P,0,pidx-1); // Sort nodes in path according to element x
// Adjust all nodes in path by element y
int prev_x=-1,prev_y=-1;
BTNode temp;
for(int i=0;i<pidx;++i)
{
if(prev_y>P[i].y)
{
temp=P[i-1];
P[i-1]=P[i];
P[i]=temp;
}
prev_y=P[i].y;
}
// for(int i=0;i<pidx;++i)
// cout<<"("<<P[i].x<<","<<P[i].y<<")"<<" "; //TEST ONLY
// cout<<endl; // TEST ONLY
// Print sequence X
prev_x=-1;
for(int i=0;i<pidx;++i)
{
if(P[i].x<0 || P[i].x==prev_x)
cout<<'-';
else
cout<<X[P[i].x];
prev_x=P[i].x;
}
cout<<endl;
// Print middle line
prev_x=-1;
prev_y=-1;
for(int i=0;i<pidx;++i)
{
if(X[P[i].x]==Y[P[i].y] && prev_x!=P[i].x && prev_y!=P[i].y)
cout<<'|';
else
cout<<' ';
prev_x=P[i].x;
prev_y=P[i].y;
}
cout<<endl;
// Print sequence Y
prev_y=-1;
for(int i=0;i<pidx;++i)
{
if(P[i].y<0 || P[i].y==prev_y)
cout<<'-';
else
cout<<Y[P[i].y];
prev_y=P[i].y;
}
cout<<endl;
}
//
// MAIN START HERE
//
int main(int argc, char** argv)
{
// initialize random seed
srand(time(NULL));
char* output=NULL; // output file
int seqlen=0; // sequence length
int c;
while ((c = getopt (argc, argv, "n:o:")) != -1)
{
switch (c)
{
case 'n':
seqlen = atoi(optarg);
break;
case 'o':
output = optarg;
break;
case '?':
if (optopt == 'n')
fprintf (stderr, "Option -%c requires an argument.\n\n", optopt);
else if (isprint (optopt))
fprintf (stderr, "Unknown option `-%c'.\n\n", optopt);
else
fprintf (stderr,
"Unknown option character `\\x%x'.\n\n",
optopt);
return 1;
default:
abort ();
}
}
int m=seqlen;
int n=seqlen-rand()%(seqlen/5);
char* ref=new char[m];
char* seq=new char[n];
memcnt+=sizeof(char)*(m+n);
// generate random sequences
ref=GenRandSeq(ref,m);
seq=GenRandSeq(seq,n);
P=new BTNode[m+n];
memcnt+=sizeof(BTNode)*(m+n);
cout<<"seq 1:"<<ref<<endl; // TEST ONLY
cout<<"seq 2:"<<seq<<endl; // TEST ONLY
Region rx, ry;
rx.start=0;
rx.end=m-1;
ry.start=0;
ry.end=n-1;
// Alignment(ref,rx,seq,ry); // TEST ONLY
DivideConquerAlignment(ref,rx,seq,ry);
Backtrace(ref,seq);
delete [] ref;
delete [] seq;
delete [] P;
// Record the space costs
fstream fs;
fs.open("memory_cost", fstream::out|fstream::app);
fs<<m<<"bp: "<<memcnt<<endl;
fs.close();
return 0;
}