最长公共子序列(nlogn)

最新推荐文章于 2025-11-08 15:00:00 发布

原创最新推荐文章于 2025-11-08 15:00:00 发布 · 1.6w 阅读

12 ·

CC 4.0 BY-SA版权

算法导论相关专栏收录该内容

41 篇文章

订阅专栏

介绍了一种改进的最长公共子序列（LCS）算法，该算法的时间复杂度为O(nlogn)，通过构建新序列并求其最长严格递增子序列（LIS）来获得LCS，实现在特定情况下比传统DP算法更快。

最长公共子序列（LCS）最常见的算法是时间复杂度为O(n^2)的动态规划(DP)算法，但在James W. Hunt和Thomas G. Szymansky 的论文"A Fast Algorithm for Computing Longest Common Subsequence"中，给出了O(nlogn)下限的一种算法。

定理：设序列A长度为n，{A(i)}，序列B长度为m，{B(i)}，考虑A中所有元素在B中的序号，即A某元素在B的序号为{P_k1,P_k2,..}，将这些序号按照降序排列，然后按照A中的顺序得到一个新序列，此新序列的最长严格递增子序列即对应为A、B的最长公共子序列。

举例来说，A={a,b,c,d,b}，B={b,c,a,b}，则a对应在B的序号为2，b对应序号为{4,0}，c对应序号为1，d对应为空集，生成的新序列为{2,4, 0, 1, 4, 0}，其最长严格递增子序列为{0,1,4}，对应的公共子序列为{b, c, b}

原论文的证明过程较复杂，其实可以简单的通过一一对应来证明。即证明A、B的一个公共子序列和新序列的一个严格递增子序列一一对应。

(1) A、B的一个公共子序列对应新序列的一个严格递增子序列

假设A、B的某一个公共子序列长度为k，则其公共子序列在A和B中可以写为

{A_i1,A_i2, ..., A_ik}

{B_j1,B_j2, ..., B_jk}

如此有A_i1 = A_j1,A_i2 = A_j2, ...., A_ik = A_jk, 考虑元素B_j1在B中的序号P(B_j1)，则有

P(B_j1)< P(B_j2) < ... < P(B_jk)

注意此严格递增子序列属于新序列的一个子序列，因此得证

(2) 新序列的一个严格递增子序列对应A、B的一个公共子序列

设新序列的一个严格递增子序列{P₁,P₂, ..., P_k}，任意两个相信的P不可能属于A中同一个元素，因为A中某元素在B中的序号按照降序排列，但此序列为严格递增序列，矛盾。所以每个P均对应于A中不同位置的元素，设为{A_i1, A_i2, ..., A_ik}。

因为P是严格递增序列，则每个P也对应B中唯一的一个元素，假设为{B_j1,B_j2, ..., B_jk}，由P的定义可知A_i1= A_j1, A_i2 = A_j2, ...., A_ik = A_jk，因此得证。

实现上比较复杂，有以下几个步骤：

(1) 对序列B排序

(2) 计算A中每个元素在B中的序号，并构成新序列

(3) 使用LIS的方法计算最长严格递增子序列

(4) 获取最长公共子序列

性能分析：

(1) 排序复杂度为nlogn

(2) 获取一个元素在B中的序号的复杂度，最小为logn，最大为n，获取所有元素的复杂度为 nlogn === n*n

(3) LIS 复杂度为nlogn

因此总体复杂度在nlogn 到 n*n logn之间，但如果(2) 步骤中A中元素在B中的序号对数很少时，性能相当优越，在实际测试时，string 中均为小写字母，长度为10000的情况下，这种方法比普通的LCS快一倍以上；如果string 中的字符扩展成char，即0-255，则这种方法比普通的LCS快至少一个数量级。以下为代码实现，可以参考：

/* ================================================================== */
/* NlogN for LCS, using LIS */
/* ================================================================== */
struct lcs_sort {
    int val;
    int pos;
};

struct lcs_lis_pos {
    int pos_val;
    int pos_1;
};

/* first,  sort string2 */
static int qsort_func(const void *s1, const void *s2)
{
    struct lcs_sort *ls1 = (struct lcs_sort *)s1, *ls2 = (struct lcs_sort *)s2;

    if (ls1->val > ls2->val) 
        return 1;
    else if (ls1->val < ls2->val)
        return -1;
    else {
        if (ls1->pos > ls2->pos)
            return -1;
        else
            return 1;
    }
}

static void sort_lcs_string(char *string2, int n2, struct lcs_sort **ls_pp)
{
    int i = 0;
    struct lcs_sort *ls = NULL;
    
    ls = malloc(sizeof(struct lcs_sort) * n2);
    for (i = 0;i < n2;i++) {
        ls[i].val = string2[i];
        ls[i].pos = i;
    }

    qsort(ls, n2, sizeof(struct lcs_sort), qsort_func);
    *ls_pp = ls;
}



/* second, find match list for char in string1 and construct a new sequence */
static int find_match_list(char ch, struct lcs_sort *ls, int l, int r, int *start)
{
    int m = 0, len, len2;
    int start1 = 0, start2 = 0;

    if (l > r)
        return 0;
    else if (l == r) {
        if (ch == ls[l].val) {
            *start = r;
            return 1;
        }
        return 0;
    }

    m = (l + r)>>1;
    if (ls[m].val < ch)
        len = find_match_list(ch, ls, m + 1, r, &start1);
    else if (ls[m].val > ch)
        len = find_match_list(ch, ls, l, m - 1, &start1);
    else {
        len = find_match_list(ch, ls, l, m - 1, &start1);
        len2 = find_match_list(ch, ls, m + 1, r, &start2) + 1;

        if (len == 0) 
            start1 = m;
        len += len2;
    }
    *start = start1;

    return len;
}

static int match_list_lcs(char *string1, int n1, struct lcs_sort *ls, int n2, struct lcs_lis_pos **lpos_pp)
{
    int i = 0, j = 0, k = 0, len = 0, start = 0;
    struct lcs_lis_pos *lpos = NULL;
    lpos = malloc(sizeof(struct lcs_lis_pos) * n1 * n2);

    for (i = 0;i < n1;i++) {
        len = find_match_list(string1[i], ls, 0, n2 - 1, &start);
        for (k = 0;k < len;k++) {
            lpos[j].pos_val = ls[start + k].pos;
            lpos[j++].pos_1 = i;
        }
    }
    
    *lpos_pp = lpos;
    return j;
}

/* third, find LIS for this new sequence */
static inline int find_pos(int *B, int len, int value)
{
    int left = 0, right = len - 1, middle = 0;

    while (left <= right) {
        /* must finding strictly increasing sub sequence */
        middle = (left + right)>>1;
        if (B[middle] < value)
            left = middle + 1;
        else
            right = middle - 1;
    }
    
    return left;
}

static int lis_nlogn(struct lcs_lis_pos *p, int len, int *inc_seq)
{
    int i = 0, pos = 0, curr_len = 0;
    int *L = NULL, *prev = NULL, *M = NULL;

    if (len <= 0)
        return 0;

    L = malloc(len * sizeof(int));
    M = malloc(len * sizeof(int));    
    prev = malloc(len * sizeof(int));

    L[0] = p[0].pos_val;
    M[0] = 0;
    prev[0] = -1;               /* the prev of the p[0] is NULL */ 
    curr_len = 1;

    /* Caculate prev and M */
    for (i = 1;i < len;i++) {
        pos = find_pos(L, curr_len, p[i].pos_val);
        L[pos] = p[i].pos_val;
        M[pos] = i;
        if (pos > 0)
            prev[i] = M[pos - 1];
        else
            prev[i] = -1;
        if (pos + 1 > curr_len)
            curr_len++;
    }

    /* Output increasing sequence */
    pos = M[curr_len - 1];
    for (i = curr_len - 1;i >= 0 && pos != -1;i--) {
        inc_seq[i] = p[pos].pos_1;
        pos = prev[pos];
    }
    return curr_len;
}


static int calc_lcs_using_lis(char *string1, int n1, char *string2, int n2)
{
    int *inc_seq = NULL;
    int list_len = 0, inc_len = 0, len = (n1 < n2) ? n1 : n2;
    struct lcs_sort *ls = NULL;
    struct lcs_lis_pos *lpos = NULL;

    inc_seq = (int *)malloc(sizeof(int) * len);

    /* first,  sort string2 */
    sort_lcs_string(string2, n2, &ls);

    /* second, find match list for char in string1 and construct a new sequence */
    list_len = match_list_lcs(string1, n1, ls, n2, &lpos);

    /* third, find LIS for this new sequence */
    inc_len = lis_nlogn(lpos, list_len, inc_seq);

    /* end, get LCS */
#if 0
    int i = 0;
    printf("LCS is: ");
    for (i = 0;i < inc_len;i++) {
        inc_seq[i] = string1[inc_seq[i]];
        printf("%c ", inc_seq[i]);
    }
    printf("\r\n");
#endif

    free(ls);
    free(lpos);    
    return inc_len;
}