ACM-最长公共子序列

最新推荐文章于 2020-05-05 18:39:22 发布

潜水的疯

最新推荐文章于 2020-05-05 18:39:22 发布

阅读量1.8k

点赞数 2

文章标签： acm

本文链接：https://blog.csdn.net/u011787119/article/details/44789253

版权

最长公共子序列（The longest common subsequence），即LCS，是指对于给定的两个字符串，它们的所有公共子序列中最长的那一个；通常要求的是它的长度。注意子序列的意思是不一定连续，但是必须保持原有的先后顺序。比如字符串ABCBDAB和BDCABA，它们之间的LCS有BDAB、BCAB、BCBA，长度都是4。

那如何求解两个字符串之间的LCS，一般可以考虑两种解法，即穷举法和动态规划法。

1、穷举法

既然是要判断子序列是否相等，那自然想到的就是把所有子序列都构造出来，然后一一判断即可。下面分析一下复杂度：对于一个字符串，要枚举其所有的子序列，可以这么考虑，对于其每一位上的字符，无非两种情况，选或不选，那么假如字符串长度为n，那么子序列的个数，也就是枚举的复杂度就是2^n。同样对于第二个字符串，枚举其所有子序列的复杂度也是2^n。再加上最后比较两个子序列的复杂度O(n)，那么总的时间复杂度也就是O(2^n*2^n*n)。这是指数级别的复杂度，增长速度非常快。举个例子来说明其效率问题，比如对于下面的数据，用下面的穷举代码进行求解，可以发现最后一个例子已经运行不出来了，可以大概算下运行时间，假设普通计算机每秒的操作次数为10^7，那么对于最后一个例子来说，运行时间大概是2^100*2^100*100/10^7≈10^55秒，化成年为单位就可以看出有多么大了：10^55/60/60/24/365≈10^47年，是个天文数字。可见穷举法的效率是很低的，已经完全不可行。

acbd
abcd
ABCBDAB
BDCABA
0123456789
0123456789
0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789

#include <cstdio>
#include <cstring>
#include <algorithm>

// Capacity of the input buffers and flag arrays. Strings are stored 1-indexed
// (callers read into text+1 / temp+1), so index 0 is unused.
const int MAXN = 1e3 + 5;
char text[MAXN], temp[MAXN];

// State for computing the LCS of text and temp by enumeration:
// FlagA[i] / FlagB[i] is 1 when character i of text / temp belongs to the
// subsequence currently being enumerated, 0 otherwise.
int FlagA[MAXN], FlagB[MAXN];
// LCSLen: longest common subsequence length found so far.
// LenA, LenB: lengths of text and temp.
int LCSLen, LenA, LenB;

// Enumerates every subsequence of temp (via FlagB) and compares it with the
// subsequence of text currently selected in FlagA.
//
// len1: number of characters selected from text (fixed during this enumeration).
// len2: number of characters selected from temp so far.
// deep: 1-based position in temp that is decided next.
//
// When a selection of temp with exactly len1 characters spells the same
// sequence as the selection of text, LCSLen is set to len1.
void LCS_EnumeB(int len1, int len2, int deep)
{
    // Prune: more characters taken from temp than from text can never match.
    if(len2 > len1) return;

    if(deep != LenB+1)
    {
        // Position deep is still undecided: first take it, then skip it.
        FlagB[deep] = 1;
        LCS_EnumeB(len1, len2+1, deep+1);
        FlagB[deep] = 0;
        LCS_EnumeB(len1, len2, deep+1);
        return;
    }

    // Every position of temp is decided. Only equal-length selections that
    // are not shorter than the current best are worth comparing.
    if(len1 != len2 || len1 < LCSLen) return;

    int idxA = 1, idxB = 1;
    int matched = 0;
    while(matched < len1)
    {
        // Advance both cursors to their next selected character.
        while(FlagA[idxA] == 0) ++idxA;
        while(FlagB[idxB] == 0) ++idxB;
        if(text[idxA] != temp[idxB]) return;
        ++idxA;
        ++idxB;
        ++matched;
    }

    // All len1 selected characters agreed pairwise.
    LCSLen = len1;
}

// Enumerates every subsequence of text (via FlagA); for each non-empty one it
// starts an enumeration of temp's subsequences through LCS_EnumeB.
//
// len1: number of characters selected from text so far.
// deep: 1-based position in text that is decided next.
void LCS_EnumeA(int len1, int deep)
{
    if(deep != LenA+1)
    {
        // Position deep is still undecided: first take it, then skip it.
        FlagA[deep] = 1;
        LCS_EnumeA(len1+1, deep+1);
        FlagA[deep] = 0;
        LCS_EnumeA(len1, deep+1);
        return;
    }

    // Every position of text is decided; the empty selection is not compared.
    if(len1 != 0)
    {
        memset(FlagB, 0, sizeof(FlagB));
        LCS_EnumeB(len1, 0, 1);
    }
}

// Reads pairs of whitespace-separated words from standard input and prints
// the LCS length of each pair, computed by exhaustive enumeration.
// Stops as soon as a complete pair can no longer be read.
int main()
{//freopen("sample.txt", "r", stdin);
    // The buffers hold MAXN (1005) chars and are filled from index 1, leaving
    // room for 1003 characters plus the terminating NUL; the field width keeps
    // scanf from writing past them. Requiring exactly 2 conversions ends the
    // loop on EOF and also when only one word of a pair was read.
    while(scanf("%1003s%1003s", text+1, temp+1) == 2)
    {
        LCSLen = 0;
        LenA = (int)strlen(text+1);
        LenB = (int)strlen(temp+1);
        memset(FlagA, 0, sizeof(FlagA));
        LCS_EnumeA(0, 1);
        printf("the length of lcs of %s and %s is %d\n", text+1, temp+1, LCSLen);
    }
    return 0;
}

2、动态规划法

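可以明显地发现，穷举法一直在重复计算子序列，这是重叠子问题，动态规划正是为了优化这类问题的算法。先看下面的转移方程（设两个字符串分别为x和y）：

$$
c[i,j]=\begin{cases}
0, & i=0 \text{ 或 } j=0\\
c[i-1,j-1]+1, & i,j>0 \text{ 且 } x_i=y_j\\
\max\bigl(c[i,j-1],\,c[i-1,j]\bigr), & i,j>0 \text{ 且 } x_i\neq y_j
\end{cases}
$$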

c[i,j]代表的是第一个字符串长度为i的前缀与第二个字符串长度为j的前缀的LCS长度，由此不难验证上面的转移方程是正确的。唯一需要注意的是第三种转移方式，如果要保证状态转移成功，则要求在计算c[i,j]的时候，c[i,j-1]和c[i-1,j]都是已经计算出来了的，其实这个条件可以由递推顺序来保证，即假如两个字符串的长度分别是n和m的话，那么可以从1到n枚举i，再从1到m枚举j。最后分析下时间复杂度，很明显是两层for循环，O(n*m)。可以看到，对于同样的测试例子，这个算法都能很快地求解出来。

最后再说一下如何在上述dp算法求解完后再构造出具体的最长公共子序列。以字符串ABCBDAB和BDCABA为例，上述算法完成后，转移状态，也就是前面的c矩阵的情况如下。

图中的箭头就是dp算法进行中状态的转移方向，那么可以将这些方向记录下来，当算法完成后，由最后一个状态，也就是右下角的c[n,m]往回走，这样就可以构造出所有的最长公共子序列了。

#include <cstdio>
#include <cstring>
#include <algorithm>

// Capacity of the input buffers and of each dimension of the DP tables.
// Strings are stored 1-indexed (index 0 is unused).
const int MAXN = 1e3 + 5;
char text[MAXN], temp[MAXN];
// C[i][j] is the LCS length of the first i characters of text and the first
// j characters of temp.
int C[MAXN][MAXN];
// dir[i][j] records the transition that produced C[i][j]:
//   1 = left-up (characters match), 2 = left, 3 = up,
//   4 = left and up are tied (either may be followed).
int dir[MAXN][MAXN];

// Computes the LCS length of two strings by dynamic programming in O(n*m),
// filling the global tables C and dir for rows 0..n and columns 0..m, where
// n and m are the string lengths. Table cells outside that rectangle are
// left untouched.
//
// text, temp: NUL-terminated strings whose first character is at index 1
//             (index 0 is ignored).
// Returns the LCS length, or -1 when either string is too long for the tables.
int LCS_DP(char *text, char *temp)
{
    const size_t lenText = strlen(text+1);
    const size_t lenTemp = strlen(temp+1);
    // Rows/columns 0..length are indexed below, so length must stay < MAXN.
    if(lenText >= (size_t)MAXN || lenTemp >= (size_t)MAXN) return -1;

    const int n = (int)lenText;
    const int m = (int)lenTemp;

    // Reset only the part of each table this call uses; this also establishes
    // the zero border in row 0 and column 0.
    for(int i=0; i<=n; ++i)
    {
        memset(C[i], 0, (size_t)(m+1) * sizeof(C[i][0]));
        memset(dir[i], 0, (size_t)(m+1) * sizeof(dir[i][0]));
    }

    // The state transition. Iterating i then j in increasing order guarantees
    // that C[i-1][j-1], C[i][j-1] and C[i-1][j] are final when C[i][j] is computed.
    for(int i=1; i<=n; ++i)
    {
        for(int j=1; j<=m; ++j)
        {
            if(text[i] == temp[j])
            {
                // left-up: extend the LCS of both shorter prefixes
                C[i][j] = C[i-1][j-1] + 1;
                dir[i][j] = 1;
                continue;
            }

            const int left = C[i][j-1];
            const int up = C[i-1][j];
            if(left > up)
            {
                C[i][j] = left;
                dir[i][j] = 2;
            }
            else if(left < up)
            {
                C[i][j] = up;
                dir[i][j] = 3;
            }
            else
            {
                // left == up: both predecessors are equally good
                C[i][j] = left;
                dir[i][j] = 4;
            }
        }
    }
    return C[n][m];
}

// Backtracks through the dir table from cell (i, j) towards the border and
// prints, one per line, the sequence collected along every path followed.
// No de-duplication is performed across paths.
//
// i, j:   current cell (prefix lengths of text and temp).
// lcs:    scratch buffer; matched characters are stored back to front.
// curlen: number of characters currently stored in lcs.
void FindLCS(int i, int j, char *lcs, int curlen)
{
    const bool atBorder = (i == 0) || (j == 0);
    if(atBorder)
    {
        // Characters were collected from the end, so emit them in reverse.
        int k = curlen;
        while(k > 0)
        {
            --k;
            printf("%c", lcs[k]);
        }
        putchar('\n');
        return;
    }

    switch(dir[i][j])
    {
    case 1:
        // text[i] == temp[j]: this character belongs to the LCS
        lcs[curlen] = text[i];
        FindLCS(i-1, j-1, lcs, curlen+1);
        break;
    case 2:
        // value came from the left neighbour
        FindLCS(i, j-1, lcs, curlen);
        break;
    case 3:
        // value came from the upper neighbour
        FindLCS(i-1, j, lcs, curlen);
        break;
    default:
        // tie: explore the left neighbour first, then the upper one
        FindLCS(i, j-1, lcs, curlen);
        FindLCS(i-1, j, lcs, curlen);
        break;
    }
}


// Reads pairs of whitespace-separated words from standard input. For each
// pair it prints the DP matrix C, the LCS length, and the sequences produced
// by FindLCS. Stops as soon as a complete pair can no longer be read.
int main()
{//freopen("sample.txt", "r", stdin);
    // The buffers hold MAXN (1005) chars and are filled from index 1, leaving
    // room for 1003 characters plus the terminating NUL; the field width keeps
    // scanf from writing past them. Requiring exactly 2 conversions ends the
    // loop on EOF and also when only one word of a pair was read.
    while(scanf("%1003s%1003s", text+1, temp+1) == 2)
    {
        const int lenlcs = LCS_DP(text, temp);
        const int n = (int)strlen(text+1);
        const int m = (int)strlen(temp+1);

        puts("the matrix of state transition is:");
        for(int i=1; i<=n; ++i)
        {
            for(int j=1; j<=m; ++j)
            {
                // Space-separated cells, newline after the last column.
                printf("%d%c", C[i][j], j==m?'\n':' ');
            }
        }
        printf("the length of lcs of %s and %s is %d\n", text+1, temp+1, lenlcs);

        printf("the lcs of %s and %s is:\n", text+1, temp+1);
        // Scratch buffer FindLCS uses to collect the matched characters.
        char lcs[MAXN];
        FindLCS(n, m, lcs, 0);
    }
    return 0;
}