理解KMP模式匹配算法

最新推荐文章于 2024-07-27 09:54:50 发布

帝王铠

最新推荐文章于 2024-07-27 09:54:50 发布

阅读量779

点赞数

文章标签： C语言

本文链接：https://blog.csdn.net/u010913001/article/details/38958021

版权

书上朴素的匹配算法（回溯主串 ch1 的 i 指针和子串 ch2 的 j 指针）

// Textbook brute-force matching loop (fragment; i, j, ch1 and ch2 are declared elsewhere).
// i walks the main string ch1, j walks the pattern ch2; element 0 of each array is used
// as the upper bound for the index, so both are presumably length-prefixed, 1-based
// strings -- TODO confirm against the declarations.
while (i <= ch1[0] && j <= ch2[0]){
		if(ch1[i] == ch2[j]){
			++i;
			++j;
			// Trace output: i was already advanced, so this prints the character
			// AFTER the one that just matched.
			printf("%c", ch1[i]);
		}
		else{
			i = i-j+2;// backtrack i: the current attempt started at i-j+1, retry one position later
			j =1;// backtrack j to the first pattern character
		}
	}

我自己开始写的匹配算法（不回溯 i，但回溯 j）

// Author's first attempt at a matching loop (fragment; i, j, ch1 and ch2 are declared
// elsewhere). On a mismatch only j is reset; i simply moves on to the next character.
// NOTE(review): because i is never moved back, an occurrence that overlaps a failed
// partial match is skipped -- e.g. pattern "aab" in main string "aaab" is not found.
while (i <= ch1[0] && j <= ch2[0]){
		if(ch1[i] == ch2[j]){
			++i;
			++j;
			// Trace output: i was already advanced, so this prints the character
			// AFTER the one that just matched.
			printf("%c", ch1[i]);
		}
		else{
			i++;// do not backtrack i, just step forward
			j =1;// backtrack j to the first pattern character
		}
	}

KMP算法。比我的更好，也更复杂（i 不回溯，j 按 next[] 回退）（不带 nextval[]）
/* KMP failure table: written by getnext(), read by kmp(). Slot 0 is unused;
 * slots 1..strlen(pattern) hold the fallback positions (1-based). */
int next[100];

/*
 * Fill the global next[] for pattern b.
 *
 * next[1] is 0; for i >= 2, next[i] is the 1-based pattern position to resume
 * comparing at after a mismatch at position i.
 *
 * Only next[1..len] is written. The table has 100 slots, so at most the first
 * 99 characters of b are processed; longer patterns are not supported by the
 * fixed-size table.
 */
void getnext(char b[]){
	const int lastSlot = (int)(sizeof next / sizeof next[0]) - 1;
	const size_t fullLen = strlen(b);
	/* Clamp so that the highest index written (len) stays inside next[]. */
	const int len = fullLen > (size_t)lastSlot ? lastSlot : (int)fullLen;
	int i=1,j=0; /* i: position being extended, j: current fallback position */

	next[1]=0;
	/* Stop at i < len: the body writes next[i+1], and next[len] is the last
	 * entry a matcher can ever read. */
	while(i < len){
		if(j == 0||b[i-1] == b[j-1]){
			i++;
			j++;
			next[i] = j;
		}
		else{
			j=next[j]; /* reuse the fallback already computed for position j */
		}
	}
}
/*
 * KMP search for pattern ch2 inside main string ch1.
 *
 * Reads the global next[] table; it is not built here, so it must already
 * describe ch2 (e.g. via getnext(ch2)) before this is called.
 *
 * Returns the 1-based position in ch1 where the first match starts,
 * or 0 when the pattern does not occur.
 */
int kmp(char ch1[],char ch2[])
{
	/* Lengths are loop-invariant: measure once instead of on every iteration. */
	const int textLen = (int)strlen(ch1);
	const int patLen = (int)strlen(ch2);
	int i=1,j=1; /* i: 1-based position in ch1, j: 1-based position in ch2 */

	while(i<=textLen&&j<=patLen)
	{
		if(j==0||ch1[i-1]==ch2[j-1])
		{
			/* j == 0 means "no prefix left to reuse": step past this text character. */
			i++;
			j++;
		}
		else
		{
			j=next[j]; /* i never moves back; only the pattern position falls back */
		}
	}
	if(j>patLen)
	{
		return i-patLen; /* i is one past the match end */
	}
	return 0;
}

手算next[i]值

我们令 next[1] = 0（下标从 1 开始，与上面的代码一致）。从 next[2] 开始，每求一个字符的 next 值，就看它前面是否有一个最长的"字符串"和从第一个字符开始的"字符串"相等(需要注意的是，这2个"字符串"不能是同一个"字符串")。如果一个都没有，这个字符的 next 值就是1；如果有，就看它有多长，这个字符的 next 值就是它的长度加1。

理解：就是说模式串自己和自己匹配，错一位开始自己匹配，看当前字符前面有没有重合的字符。如 aasd 的 next 值分别是 0 1 2 1：第一个 a 默认为 0；第二个 a 默认为 1（模式匹配时从模式串第一个字符开始，所以默认 1）；第三个 s 前面有个 a（模式匹配时不必回溯到最开始的 a，回溯到第二个 a 就好，位置是模式串的 2）；第四个 d 只能从模式串第 1 个字符重新匹配，所以是 1。

计算修正后的 Nextval[i] 值:

我们令 nextval[1] = 0。从 nextval[2] 开始，如果某位(字符)与它 next 值指向的位(字符)相同，则该位的 nextval 值就是指向位的 nextval 值(nextval[i] = nextval[ next[i] ])；如果不同，则该位的 nextval 值就是它自己的 next 值(nextval[i] = next[i])。

理解：这个是升级版本，为了解决模式串 aaaab 和主串 aaabaaaaab 的匹配问题。你可以看到，当"三个 a 一个 b"匹配失败时，按原来的 next 还要依次回退到第三个 a、第二个 a 再比较，何不滑远点，直接对 b 判断，甚至对 b 后面的 a 进行判断，这就是产生 nextval 的原因。相应的 next 与 nextval 值如下：

j 1 2 3 4 5

模式串 a a a a b

next 0 1 2 3 4

nextval 0 0 0 0 4

/*
 * Compute the improved KMP failure table ("nextval") for the C string ch2.
 *
 * ch2  - NUL-terminated pattern; its characters are ch2[0..len-1].
 * next - output array, written at indices 1..len (index 0 is not touched).
 *        It must have room for at least len + 1 ints, and at least 2 ints
 *        even for an empty pattern because next[1] is always written.
 *
 * Positions are 1-based: pattern position i is the character ch2[i-1].
 * When the character at position i equals the character at its fallback
 * position j, a retry at j would fail on the same text character, so the
 * entry inherits the fallback of j instead.
 */
void Nextval(char*ch2,int*next){
	int *nextval = next; /* the output parameter holds nextval values */
	const int len = (int)strlen(ch2);
	int i = 1; /* position whose successor is being computed */
	int j = 0; /* current fallback candidate; 0 means "none" */

	nextval[1] = 0;
	while(i < len){
		if( j == 0 || ch2[i-1] == ch2[j-1]){
			++i;
			++j;
			if(ch2[i-1] != ch2[j-1])
				nextval[i] = j;
			else
				nextval[i] = nextval[j];
		}
		else
			j = nextval[j];
	}
}

另外

/*
 * Build the improved KMP failure table for the C string ch2 into next[].
 *
 * Pattern positions are 1-based (position k is the character ch2[k-1]);
 * next[1..strlen(ch2)] is written and next[0] is left alone, so the caller
 * must supply at least strlen(ch2) + 1 ints (and never fewer than 2).
 *
 * An entry whose character equals the character at its fallback position
 * takes over that position's own entry, which skips a comparison that is
 * known to fail.
 */
void GetNextEx(char*ch2,int*next)
{
	const int patLen = (int)strlen(ch2);
	int k=1,j=0; /* k: position being extended, j: fallback candidate (0 = none) */

	next[1] = 0;
	while(k<patLen)
	{
		if(j != 0 && ch2[k-1] != ch2[j-1])
		{
			j=next[j];
			continue;
		}
		++k;
		++j;
		if(ch2[k-1]==ch2[j-1])
			next[k]=next[j];
		else
			next[k]=j;
	}
}

和朴素算法相比，只是修改一句话而已（？），但是算法复杂度从O(m*n) 变成了：O(m+n)

完整代码by：recruits

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>

/*
 * Status codes returned by getNextVal() and indexOfStrPos().
 * No trailing ';' in the expansions, so the names can also be used inside
 * expressions such as comparisons; the negative value is parenthesised.
 */
#define RST_OVERFLOW (-1)
#define RST_ERROR 0
#define RST_OK 1

/* Failure table shared by the program: allocated in main() and read by
 * indexOfStrPos(). NULL until main() allocates it. */
int *nextVal=NULL;

/*
 * Build the improved KMP failure table for pStr (0-based indexing).
 *
 * pStr    - NUL-terminated pattern; must be non-NULL and non-empty.
 * nextVal - output array with room for at least strlen(pStr) ints;
 *           entries 0..strlen(pStr)-1 are written, nextVal[0] is -1.
 *
 * Returns RST_OK on success, RST_ERROR when pStr is NULL or empty or when
 * nextVal is NULL.
 */
int getNextVal(const char *pStr, int *nextVal){
    if(NULL == pStr || NULL == nextVal){
        return RST_ERROR;
    }

    const int len = (int)strlen(pStr);
    if(len == 0){
        return RST_ERROR;
    }

    int j=0;  /* index into the pattern */
    int k=-1; /* fallback candidate; -1 means "restart from the beginning" */
    nextVal[0] = -1;

    /* Stop at len - 1: the body writes nextVal[j + 1], and index len - 1 is the
     * last entry a matcher reads. Going further would write past a
     * len-element buffer. */
    while(j < len - 1){
        if(k == -1 || pStr[j] == pStr[k]){
            ++j; ++k;
            if(pStr[j] != pStr[k]){
                nextVal[j] = k;
            } else {
                /* Same character at the fallback position: a retry there would
                 * fail again, so inherit its fallback instead. */
                nextVal[j] = nextVal[k];
            }
        } else {
            k = nextVal[k];
        }
    }

    return RST_OK;
}

/*
 * KMP search for subStr inside srcStr, starting at 0-based index pos.
 *
 * Reads the file-level nextVal table; it is not built here, so it must
 * already describe subStr (see getNextVal) before this is called.
 *
 * Returns the 0-based index of the first match at or after pos.
 * Returns RST_ERROR when an argument is NULL, pos is outside srcStr, the
 * table is missing for a non-empty pattern, or no match exists.
 *
 * NOTE: RST_ERROR is 0, which is also the result for a match at index 0, so
 * a caller cannot tell those two outcomes apart from the return value alone.
 */
int indexOfStrPos(const char *srcStr, const char *subStr, int pos){
    if(NULL == srcStr || NULL == subStr || pos < 0){
        return RST_ERROR;
    }

    const size_t srcSize = strlen(srcStr);
    /* Compare as "pos >= size" rather than "pos > size - 1": the subtraction
     * wraps around for an empty source string and lets every pos through. */
    if((size_t)pos >= srcSize){
        return RST_ERROR;
    }

    int srcLen = (int)srcSize;
    int subLen = (int)strlen(subStr);
    if(subLen > 0 && NULL == nextVal){
        return RST_ERROR; /* a mismatch would dereference the missing table */
    }

    int i=pos,j=0;
    while(i < srcLen && j < subLen){
        if(j == -1 || srcStr[i] == subStr[j]){
            /* j == -1: nothing of the pattern can be reused, move on in srcStr. */
            ++i; ++j;
        } else {
            j = nextVal[j];
        }
    }

    if(j >= subLen){
        return i - j;
    } else {
        return RST_ERROR;
    }
}

/*
 * Search srcStr for subStr from the very beginning.
 * Forwards to indexOfStrPos with a start index of 0 and returns its result
 * unchanged.
 */
int indexOfStr(const char *srcStr, const char *subStr){
    const int startPos = 0;
    int found = indexOfStrPos(srcStr, subStr, startPos);
    return found;
}

/*
 * Demo driver: builds the failure table for a fixed pattern, prints it, then
 * searches a fixed text from index 0 and from index 15 and prints both results.
 * Returns 0 on success, 1 if the table cannot be allocated.
 */
int main(int argc, const char * argv[]){
    (void)argc;
    (void)argv;

    /* String literals must not be modified, so hold them through const pointers. */
    const char *test = "asldkalalskdblalskdl";
    const char *p = "lalskdl";
    const size_t patLen = strlen(p);

    /* One slot more than the pattern length, so index patLen is also valid
     * storage for the table. */
    nextVal = malloc((patLen + 1) * sizeof *nextVal);
    if (NULL == nextVal) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    getNextVal(p, nextVal);

    for (int i=0; i<(int)patLen; i++) {
        printf("--->nextVal[%d] is :%d\n", i, nextVal[i]);
    }

    int rst = indexOfStr(test, p);
    int rst2 = indexOfStrPos(test, p, 15);
    printf("%d---%d\n", rst, rst2);

    free(nextVal);
    nextVal = NULL; /* the global must not dangle after the buffer is released */
    return 0;
}