// Based on Hirschberg's algorithm to find the best alignment for two strings.
// This algorithm is proved to run in O(mn) time and O(m + n) space
// reference:
// https://en.wikipedia.org/wiki/Hirschberg%27s_algorithm
// http://www.cs.princeton.edu/~wayne/kleinberg-tardos/pdf/06DynamicProgrammingII.pdf
// http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Dynamic/Hirsch/
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <Windows.h>
using namespace std;
vector<int> score(string X, string Y, int alpha, int theta, bool rev)
{
int m = static_cast<int>(X.length());
int n = static_cast<int>(Y.length());
vector<int> ret(n + 1, 0);
for (int j = 0; j <= n; ret[j] = j * theta, ++j);
if (rev)
{
reverse(X.begin(), X.end());
reverse(Y.begin(), Y.end());
}
for (int i = 1; i <= m; ++i)
{
int prev = ret[0];
ret[0] += theta;
for (int j = 1; j <= n; ++j)
{
int cost = 0;
if (X[i - 1] == Y[j - 1])
cost = prev;
else
cost = prev + alpha;
cost = min(cost, theta + ret[j]);
cost = min(cost, theta + ret[j - 1]);
prev = ret[j];
ret[j] = cost;
}
}
return ret;
}
pair<string, string> NeedlemanWunsch(string X, string Y, int alpha, int theta)
{
int m = X.length();
int n = Y.length();
vector<vector<int>> M(m + 1, vector<int>(n + 1, 0));
for (int i = 0; i <= m; ++i)
M[i][0] = i * theta;
for (int j = 0; j <= n; ++j)
M[0][j] = j * theta;
for (int i = 1; i <= m; ++i)
{
for (int j = 1; j <= n; ++j)
{
int cost = 0; int kind = 1;
if (X[i - 1] == Y[j - 1])
cost = M[i - 1][j - 1];
else
cost = M[i - 1][j - 1] + alpha;
cost = min(cost, theta + M[i - 1][j]);
cost = min(cost, theta + M[i][j - 1]);
M[i][j] = cost;
}
}
int i = m, j = n;
vector<string> align(2);
// backtracking to get the solution
while (i >= 1 || j >= 1)
{
if ((i > 0 && M[i][j] == theta + M[i - 1][j]) || j == 0)
{
align[0].push_back(X[i - 1]);
align[1].push_back('-');
--i;
}
else if ((j > 0 && M[i][j] == theta + M[i][j - 1]) || i == 0)
{
align[0].push_back('-');
align[1].push_back(Y[j - 1]);
--j;
}
else if (M[i][j] == M[i - 1][j - 1] + alpha || M[i][j] == M[i - 1][j - 1])
{
align[0].push_back(X[i - 1]);
align[1].push_back(Y[j - 1]);
--i, --j;
}
}
reverse(align[0].begin(), align[0].end());
reverse(align[1].begin(), align[1].end());
return {align[0], align[1]};
}
int partitionY(vector<int> &scoreL, vector<int> &scoreR)
{
int n = static_cast<int>(scoreL.size());
int minimum = INT_MAX, index = 0;
reverse(scoreR.begin(), scoreR.end());
for (int i = 0; i < n; ++i)
{
if (scoreL[i] + scoreR[i] < minimum)
{
minimum = scoreL[i] + scoreR[i];
index = i;
}
}
return index;
}
pair<string, string> Hirschberg(string X, string Y, int alpha, int theta)
{
int m = static_cast<int>(X.length());
int n = static_cast<int>(Y.length());
string Z, W;
if (m == 0)
{
return{ string(n, '-'), Y };
}
else if (n == 0)
{
return{ X, string(m, '-') };
}
else if (m == 1 || n == 1)
{
return NeedlemanWunsch(X, Y, alpha, theta);
}
else
{
int xmid = m / 2;
string X_l = X.substr(0, xmid);
string X_r = X.substr(xmid, m - xmid);
int ymid = partitionY(score(X_l, Y, alpha, theta, false), score(X_r, Y, alpha, theta, true));
string Y_l = Y.substr(0, ymid);
string Y_r = Y.substr(ymid, n - ymid);
pair<string, string> ret1 = Hirschberg(X_l, Y_l, alpha, theta);
pair<string, string> ret2 = Hirschberg(X_r, Y_r, alpha, theta);
Z = ret1.first + ret2.first;
W = ret1.second + ret2.second;
}
return { Z, W };
}
int main()
{
string X = "CTACCG";
string Y = "TACATG";
string X1 = "AGTACGCA";
string Y1 = "TATGC";
auto ret1 = Hirschberg(X, Y, 1, 2);
cout << "X: " << X << endl;
cout << "Y: " << Y << endl;
cout << "the alignment for X and Y:" << endl;
cout << ret1.first << endl;
cout << ret1.second << endl;
cout << "--------------------------------------------------" << endl;
auto ret2 = Hirschberg(X1, Y1, 1, 2);
cout << "X1: " << X << endl;
cout << "Y1: " << Y << endl;
cout << "the alignment for X1 and Y1:" << endl;
cout << ret2.first << endl;
cout << ret2.second << endl;
system("PAUSE");
return 0;
}
Hirschberg's algorithm to find string alignment
最新推荐文章于 2020-11-28 20:39:27 发布