带通配符*和？的kmp

最新推荐文章于 2019-11-06 09:35:59 发布

tiandyoin

最新推荐文章于 2019-11-06 09:35:59 发布

阅读量1.5k

点赞数 1

Wildcard

Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 125536/65536 K (Java/Others)
Total Submission(s): 563 Accepted Submission(s): 135

Problem Description

When specifying file names (or paths) in DOS, Microsoft Windows and Unix-like operating systems, the asterisk character ("*") substitutes for any zero or more characters, and the question mark ("?") substitutes for any one character.
Now give you a text and a pattern, you should judge whether the pattern matches the text or not.

Input

There are several cases. Each case consists of exactly two lines. The first line contains a text, which contains only lowercase letters. The second line contains a pattern, which consists of lowercase letters and the two wildcards ("*", "?").
The text is a non-empty string and its size is less than 100,000, and so is the pattern.

We ensure the number of "?" and the number of "*" in the pattern are both no more than 10.

Output

Output "YES" if the pattern matches the text, otherwise "NO".

Sample Input

abcdef
a*c*f
 

Sample Output

YES

Source

2011 Multi-University Training Contest 7 - Host by ECNU

HDU 3901 Wildcard

题目大意:

给两个长度不超过100000的字符串, 一个是带有通配符?和*的模式串, 问能否匹配. 通配符不超过10个.

这题弄了差不多一天= =···不过还算是有收获吧！

方法是我自己YY出来的，代码老长==···希望能有更好的方法，望各位神牛指教！！

我先把模式串处理成s1-s2-s3这样的形式，即两头没有'?'和'*'（两头的'*'直接忽略；统计两头'?'的数目，并从文本串的对应一端去掉同样数目的字符），以方便下面的处理。

然后分别对子串进行kmp，找出所有匹配。

再然后对'?'两边的子串进行合并，使匹配串都是s1*s2*s3这样的形式，然后就可以进行贪心了。

不过具体的实现我觉得有挺多细节的···具体代码有详细注释：

#include <cstdio>
#include <cstring>

// Capacity of every per-character buffer and per-occurrence table below.
static const int maxn = 100100;

// In all of these arrays, index 0 holds the number of entries stored in
// indices 1..count.
// start/end: for each pattern segment (first index), the start and end
//            positions of its occurrences in the text, as recorded by kmp().
// s/e:       the start/end lists of the merged, '*'-separated segments that
//            main() builds for the greedy pass.
// temp1/temp2: scratch start/end lists used while merging two segments that
//            are joined by '?'.
static int start[25][maxn], end[25][maxn], s[25][maxn], e[25][maxn], temp1[maxn], temp2[maxn];

// front/back: number of '?' directly before / after the segment about to be
// matched. kmp() subtracts front from each recorded start, adds back to each
// recorded end, and then resets both to 0.
int front, back;
// KMP failure table for the pattern currently being matched by kmp().
int fail[maxn];

/*
 * Record every occurrence of `pat` inside `str` for pattern segment `num`,
 * using Knuth-Morris-Pratt matching.
 *
 * For each occurrence whose first character is at text index b and last
 * character at text index c, this appends
 *     start[num][..] = b - front
 *     end[num][..]   = c + back
 * so the '?' wildcards adjacent to the segment are folded into the recorded
 * span. Index 0 of start[num] / end[num] is reset and then used as the
 * occurrence counter. Occurrences are found left to right, so both lists are
 * in ascending order.
 *
 * Side effects: overwrites the leading entries of the global `fail` table
 * and resets the globals `front` and `back` to 0 before returning.
 *
 * NOTE(review): the recorded start may be negative (and the end may lie past
 * the end of the text) when `front`/`back` are non-zero; this function does
 * not filter such spans.
 */
void kmp(char* str, char* pat, int num)
{
    int i, j, k;

    start[num][0] = 0;
    end[num][0] = 0;

    // An empty pattern has no failure table to build and yields no
    // occurrences; bail out before the table loop reads pat[1].
    if( pat[0] == '\0' )
    {
        front = back = 0;
        return;
    }

    // fail[i] is the index of the last character of the longest proper
    // border of pat[0..i], or -1 when there is none. Only the entries for
    // the current pattern are initialised; nothing beyond them is read.
    fail[0] = -1;
    for( i = 1; pat[i]; ++i )
    {
        for( k = fail[i-1]; k >= 0 && pat[i] != pat[k+1]; k = fail[k] );
        fail[i] = ( pat[k+1] == pat[i] ) ? k + 1 : -1;
    }

    i = j = 0;
    while( str[i] && pat[j] )
    {
        if( str[i] == pat[j] )
        {
            ++i, ++j;
            if( pat[j] == '\0' )
            {
                // Full match covering str[i-j .. i-1]; widen it by the
                // adjacent '?' counts before storing.
                start[num][++start[num][0]] = i - j - front;
                end[num][++end[num][0]] = i - 1 + back;
                // Fall back to the longest border so overlapping
                // occurrences are found as well.
                j = fail[j-1] + 1;
            }
        }
        else if( j == 0 ) ++i;
        else j = fail[j-1] + 1;
    }
    front = back = 0;
}

// str: the text; p: the pattern; t: scratch buffer holding the literal
// segment currently being extracted from p.
static char str[maxn], p[maxn], t[maxn];
// st[k] describes the separator that follows segment k:
// 1 means '?' (segments are adjacent), 0 means '*'.
static bool st[maxn];

/*
 * Reads (text, pattern) pairs from stdin until input is exhausted and, for
 * each pair, reports whether the literal segments of the pattern can be
 * placed in the text in order, printing diagnostic tables along the way.
 *
 * Steps per case:
 *   1. Strip leading/trailing wildcards from the pattern; every stripped '?'
 *      also removes one character from the corresponding end of the text.
 *   2. Split the remaining pattern into literal segments and collect all
 *      occurrences of each one with kmp().
 *   3. Merge segments that are separated only by '?' so that the remaining
 *      segments are separated by '*'.
 *   4. Greedily pick, for each merged segment, the first occurrence that
 *      starts after the end of the previously chosen one.
 *
 * NOTE(review): the greedy pass starts from k = -1 and accepts the first
 * occurrence of each segment; nothing here forces the first segment to begin
 * at index 0 or the last one to end at the end of the text. Confirm whether
 * whole-text anchoring is intended before relying on the YES/NO verdict.
 */
int main()
{
    int i, j, k, ll, rr, mid;
    int len, cnt1, cnt2, tlen;
    bool ok;

    // Require both tokens: a lone trailing token must not be matched against
    // a stale pattern left over from the previous case.
    while( scanf("%s %s", str, p) == 2 )
    {
        ok = 1;
        front = back = 0;

        // Normalise the ends of p (and trim str accordingly) so the pattern
        // takes the form s1-s2-s3-s4 with no '?' or '*' at either end.
        len = (int)strlen(p);
        tlen = (int)strlen(str);
        for( i = cnt1 = 0; i < len && (p[i] == '*' || p[i] == '?'); ++i )
            if( p[i] == '?' )
                ++cnt1;
        for( j = len-1, cnt2 = 0; j >= 0 && (p[j] == '*' || p[j] == '?'); --j )
            if( p[j] == '?' )
                ++cnt2;
        if( i > j )
        {
            // The pattern consists of wildcards only.
            if( cnt1 <= tlen )
                printf("YES 0 0\n");
            else
                printf("NO\n");
            continue;
        }
        // The text must supply one character for every stripped '?'.
        // Without this check the trimming below would index outside the
        // current text.
        if( cnt1 + cnt2 > tlen )
        {
            printf("NO\n");
            continue;
        }
        p[j+1] = '\0';
        str[tlen-cnt2] = '\0';
        // Source and destination overlap, so memmove is required (strcpy on
        // overlapping strings is undefined). Lengths include the terminator.
        memmove(p, p+i, (size_t)(j - i + 2));
        memmove(str, str+cnt1, (size_t)(tlen - cnt2 - cnt1 + 1));

        // Find the occurrences of every literal segment.
        len = (int)strlen(p);
        for( i = j = k = 0; i < len; ++i )
        {
            if( p[i] == '*' )
            {
                if( !j )
                    continue;
                t[j] = '\0';
                st[k] = 0;
                kmp(str, t, k++);
                j = 0;
            }
            else if( p[i] == '?' )
            {
                if( !j )
                {
                    // Example: aa*??b
                    // The "??" belongs to the following segment ("b").
                    front = 1;
                    while( p[i+1] == '?' )
                    {
                        ++front;
                        ++i;
                    }
                    continue;
                }
                back = 1;
                while( p[i+1] == '?' )
                {
                    // Example: aa???
                    ++back;
                    ++i;
                }
                // Examples: aa???*b (separator '*') versus aa???b (adjacent).
                st[k] = p[i+1] == '*' ? 0 : 1;
                t[j] = '\0';
                kmp(str, t, k++);
                j = 0;
            }
            else
                t[j++] = p[i];
        }
        t[j] = '\0';
        kmp(str, t, k);

        // Every segment must occur at least once.
        for( i = 0; ok && i <= k; ++i )
            if( !start[i][0] )
                ok = 0;

        // Merge the segments on both sides of a state-1 ('?') separator so
        // that the pattern ends up split by '*' only, ready for the greedy.
        if( ok )
        {
            len = 0;
            for( i = 0; ok && i < k; ++i )
            {
                // Scratch lists for the merged start and end positions.
                temp1[0] = temp2[0] = 0;
                if( st[i] )
                {
                    // For each occurrence of the left segment, binary-search
                    // the right segment for one starting right after it.
                    for( j = 1; j <= end[i][0]; ++j )
                    {
                        ll = 1, rr = start[i+1][0]+1;
                        while( rr - ll > 1 )
                        {
                            mid = (ll+rr)/2;
                            if( start[i+1][mid] <= end[i][j] + 1 )
                                ll = mid;
                            else
                                rr = mid;
                        }
                        if( start[i+1][ll] == end[i][j] + 1 )
                        {
                            temp1[++temp1[0]] = start[i][j];
                            temp2[++temp2[0]] = end[i+1][ll];
                        }
                    }

                    // No adjacent pair exists: the match fails.
                    if( !temp1[0] )
                        ok = 0;
                    else
                    {
                        // The merged lists replace those of segment i+1.
                        memcpy(start[i+1], temp1, sizeof(temp1));
                        memcpy(end[i+1], temp2, sizeof(temp2));
                    }
                }
                else
                {
                    for( s[len][0] = e[len][0] = 0, j = 1; j <= end[i][0]; ++j )
                    {
                        // s and e hold the merged segments.
                        s[len][++s[len][0]] = start[i][j];
                        e[len][++e[len][0]] = end[i][j];
                    }
                    ++len;
                }
            }

            // Do not forget the last segment.
            for( s[len][0] = e[len][0] = 0, j = 1; j <= end[k][0]; ++j )
            {
                s[len][++s[len][0]] = start[k][j];
                e[len][++e[len][0]] = end[k][j];
            }
            ++len;
        }

        // Greedy verification.
        if( ok )
        {
            k = -1;
            for( i = 0; ok && i < len; ++i )
            {
                for( j = 1; j <= s[i][0]; ++j )
                    if( s[i][j] > k )
                    {
                        // Take the first candidate, i.e. the shortest
                        // matching stretch of the text.
                        k = e[i][j];
                        break;
                    }
                if( j > s[i][0] )
                    ok = 0;
            }
        }

        if( ok )
        {
            printf("(suffix = include ? : not *) \nYES\nfront = %d\nback = k + cnt1 + cnt2= %d\ns=\n", s[0][1], k + cnt1 + cnt2);

            for (int i = 0; i < len; i++)
            {
                printf("%3d: ", s[i][0]);
                for(int j = 1; j <= s[i][0]; j++)
                    printf("%3d ", s[i][j]);

                printf("\n");
            }
            printf("YES k = %d cnt1 = %d cnt2 = %d\ne=\n", k, cnt1, cnt2);
            for ( i = 0; i < len; i++)
            {
                printf("%3d: ", e[i][0]);
                for(int j = 1; j <= e[i][0]; j++)
                    printf("%3d ", e[i][j]);

                printf("\n");
            }
        }
        else
            printf("NO k = %d\n", k);


        printf("///\n");
    }

    return 0;
}

再给出几组我debug中比较有价值的数据：

abababcdababcdecdabefcdabef
*ab??cd??ef*

abcdebcdde
*abcd?e*

babbbabab
ab?b?bab

abcdef
a*b*c*d*e*f

isdjkasd
i*s*d*j*k*a*s*d

hellokugou
hello*??gou

dfjijijiugnmlok
??*f?ij*ug?ml?k

dfjijijiugnmlok
?*f?ij*ug?ml?k

abcdefghijklmnopqrstuvwxyz
ab*?*ef?h?jk*qr??*u??x?z

sodfmkkoasa
s?df?k?o?*a

sodfmkkoas
s?df?k?o?*