POJ - 3693 Maximum repetition substring （后缀数组+ST表）

DrGilbert

于 2020-10-20 21:19:05 发布

阅读量147

点赞数

分类专栏：字符串文章标签： ACM 字符串后缀数组

本文链接：https://blog.csdn.net/oampamp1/article/details/109189569

版权

字符串专栏收录该内容

3 篇文章 0 订阅

订阅专栏

原题题面

The repetition number of a string is defined as the maximum number R such that the string can be partitioned into R same consecutive substrings. For example, the repetition number of “ababab” is 3 and “ababa” is 1.

Given a string containing lowercase letters, you are to find a substring of it with maximum repetition number.

输入格式

The input consists of multiple test cases. Each test case contains exactly one line, which
gives a non-empty string consisting of lowercase letters. The length of the string will not be greater than 100,000.

The last test case is followed by a line containing a ‘#’.

输出格式

For each test case, print a line containing the test case number( beginning with 1) followed by the substring of maximum repetition number. If there are multiple substrings of maximum repetition number, print the lexicographically smallest one.

输入样例

ccabababc
daabbccaa
#

输出样例

Case 1: ababab
Case 2: aa

题面分析

对于一个字符串，找出其连续的重复次数最多的子串，如果有多个请输出字典序最小的那个。
首先关于后缀数组的所有理论请看这篇论文->国家集训队2009论文集——后缀数组
我们暴力枚举可能的长度L，然后再s[1],s[1+L],s[1+2L]… 的里面匹配他们的最长公共前缀和最长公共后缀，记为K。那么重复出现的次数就是K/L+1，我们找到最大的保存下来，并同时记录当时对应的L。
但这样找到的结果可能有多个，因此我们通过枚举**SA[i]**来匹配，这样找到的第一个结果一定是字典序最小的。
其他的细节看代码和论文即可。

AC代码(516ms)

#include <bits/stdc++.h>
using namespace std;
const int maxn=1e5;
#define ll long long
namespace ST
{
    const int logn=30;
    int f[maxn+10][logn], Log[maxn+10];
    void init_log()
    {
        Log[0]=-1;
        Log[1]=0;
        Log[2]=1;
        for(int i=3; i<maxn; i++)
        {
            Log[i]=Log[i/2]+1;
        }
    }
    void build(int n, int *a)
    {
        for(int i=1; i<=n; i++)
        {
            f[i][0]=a[i];
        }
        init_log();
        for(int j=1; j<=logn; j++)
        {
            for(int i=1; i+(1<<j)-1<=n; i++)
            {
                f[i][j]=min(f[i][j-1], f[i+(1<<(j-1))][j-1]);
            }
        }
    }
    int query(int l, int r)
    {
        if (l>r)
            swap(l, r);
        l++;
        int s=Log[r-l+1];
        return min(f[l][s], f[r-(1<<s)+1][s]);
    }
}
namespace SA
{
    int SA[maxn+10];//所有后缀排序后第i小的后缀的起始位置的下标;
    int Rank[maxn+10];//Rank[i] 第i个后缀的排名,SA[Rank[i]]=Rank[SA[i]]=i
    int Height[maxn+10];
    //Height[i]=排名为i的后缀与排名为(i-1)的后缀的最长公共前缀
    //Height[i]>=Height[i-1]+1
    //LCP(i,k)=min(LCP(j,j-1)) (1<i<=j<=k<=n)
    //不同子串的个数:n(n+1)/2-(Height[2]+Height[3]+...+Height[n])
    int tax[maxn+10], tp[maxn+10];
    //tax[i] 计数排序辅助数组;
    //tp[i] rk的辅助数组(计数排序中的第二关键字),与SA意义一样。
    int len;
    char s[maxn+10];
    int m;//m当前离散后的排名种类数
    void RSort()
    {
        //rk第一关键字,tp第二关键字
        for(int i=0; i<=m; i++) tax[i]=0;
        for(int i=1; i<=len; i++) tax[Rank[tp[i]]]++;
        for(int i=1; i<=m; i++) tax[i]+=tax[i-1];
        for(int i=len; i>=1; i--) SA[tax[Rank[tp[i]]]--]=tp[i];
        //确保满足第一关键字的同时，再满足第二关键字的要求
    }//计数排序,把新的二元组排序
    int cmp(int *f, int x, int y, int w)
    {
        return f[x]==f[y] && f[x+w]==f[y+w];
    }
    //通过二元组两个下标的比较，确定两个子串是否相同
    void GetSA()
    {
        for(int i=1; i<=len; i++)
        {
            Rank[i]=s[i], tp[i]=i;
        }
        RSort();
        for(int w=1, p=1, i; p<len; w+=w, m=p)
        {
            //把子串长度翻倍,更新rk
            //w 当前一个子串的长度;
            //当前的tp(第二关键字)可直接由上一次的SA的得到
            for(p=0, i=len-w+1; i<=len; i++)
                tp[++p]=i; //长度越界,第二关键字为0
            for(i=1; i<=len; i++)
                if (SA[i]>w)
                    tp[++p]=SA[i]-w;
            //更新SA值,并用tp暂时存下上一轮的rk(用于cmp比较)
            RSort(), swap(Rank, tp), Rank[SA[1]]=p=1;
            //用已经完成的SA来更新与它互逆的rk,并离散rk
            for(i=2; i<=len; i++)
                Rank[SA[i]]=cmp(tp, SA[i], SA[i-1], w) ? p : ++p;
        }
        //离散：把相等的字符串的rk设为相同。
    }
    void GetHeight()
    {
        //LCP
        int j, k=0;
        for(int i=1; i<=len; Height[Rank[i++]]=k)
            for(k=k ? k-1 : k, j=SA[Rank[i]-1]; s[i+k]==s[j+k]; ++k);
        //这个知道原理后就比较好理解程序
    }
    void PrintSA()
    {
        for(int i=1; i<=len; i++)
        {
            printf("%d ", SA[i]);
        }
        printf("\n");
    }
    void PrintHeight()
    {
        for(int i=1; i<=len; i++)
        {
            printf("%d ", Height[i]);
        }
        printf("\n");
    }
    void PrintRank()
    {
        for(int i=1; i<=len; i++)
        {
            printf("%d ", Rank[i]);
        }
        printf("\n");
    }
}
bool check(int mid, int m)
{
    int minx=SA::SA[1];
    int cnt=1;
    for(int i=1; i<=SA::len; i++)
    {
        if (SA::Height[i]<mid)
        {
            cnt=1;
            minx=SA::SA[i];
        }
        else
        {
            cnt++;
            minx=min(minx, SA::SA[i]);
            if (cnt>=m)
                return 1;
        }
    }
    return 0;
}
char a[maxn+10];
int answer[maxn+10];//存放重复的基本子串单元的长度
void solve()
{
    int tot=0;
    while(~scanf("%s", a+1) && a[1]!='#')
    {
        int len=strlen(a+1);
        for(int i=1; i<=len; i++)
        {
            SA::s[i]=a[i];
        }
        SA::s[len+1]=0;
        SA::len=len;
        SA::m=200;
        SA::GetSA();
        SA::GetHeight();
//        SA::PrintHeight();
        ST::build(len, SA::Height);
        int mx=-1, cnt=0;
        //mx是重复次数
        for(int l=1; l<=len; l++)
        {
            for(int pos=1; pos+l<=len; pos+=l)
            {
                int LCP=ST::query(SA::Rank[pos], SA::Rank[pos+l]);
//                printf("LCP=%d\n",LCP);
                int res=LCP/l+1;//重复次数
                int k=pos-(l-LCP%l);//为了凑整l的整数倍个，我们假设前面缺少l-LCP%i个,这是对应的下标
                if (k>=1 && LCP%l && ST::query(SA::Rank[k], SA::Rank[k+l]))
                    //k的位置可以和k+l匹配到大于supplement的，以为着这个部分可以和pos的位置合起来
                {
                    res++;
                }
                if (res>mx)
                {
                    mx=res;
                    cnt=0;
                    answer[++cnt]=l;
                }
                else if (res==mx)
                {
                    answer[++cnt]=l;
                }
            }
        }
        int anslen=0, anspos=1;
//        for(int i=1;i<=cnt;i++)
//        {
//            printf("%d ",answer[i]);
//        }
        for(int i=1; i<=len && !anslen; i++)
        //根据字典序查找最小的满足解
        {
            for(int j=1; j<=cnt; j++)
            {
                if (ST::query(SA::Rank[SA::SA[i]], SA::Rank[SA::SA[i]+answer[j]])>=(mx-1)*answer[j])
                {
                    anslen=answer[j];
                    anspos=SA::SA[i];
                    break;
                }
            }
        }
//        printf("pos=%d len=%d mx=%d\n", anspos, anslen, mx);
        printf("Case %d: ", ++tot);
        for(int i=anspos; i<=anspos+anslen*mx-1; i++)
        {
            printf("%c", SA::s[i]);
        }
        printf("\n");
    }
}
int main()
{
//    ios_base::sync_with_stdio(false);
//    cin.tie(0);
//    cout.tie(0);
#ifdef ACM_LOCAL
    freopen("in.txt", "r", stdin);
    freopen("out.txt", "w", stdout);
    long long test_index_for_debug=1;
    char acm_local_for_debug;
    while(cin>>acm_local_for_debug)
    {
        cin.putback(acm_local_for_debug);
        if (test_index_for_debug>100)
        {
            throw runtime_error("Check the stdin!!!");
        }
        auto start_clock_for_debug=clock();
        solve();
        auto end_clock_for_debug=clock();
        cout<<"\nTest "<<test_index_for_debug<<" successful"<<endl;
        cerr<<"Test "<<test_index_for_debug++<<" Run Time: "
            <<double(end_clock_for_debug-start_clock_for_debug)/CLOCKS_PER_SEC<<"s"<<endl;
        cout<<"--------------------------------------------------"<<endl;
    }
#else
    solve();
#endif
    return 0;
}