KMP算法是一种改进的字符串匹配算法,由D.E.Knuth、J.H.Morris和V.R.Pratt同时发现。
KMP算法的关键是利用匹配失败后的信息,尽量减少模式串与主串的匹配次数,以达到快速匹配的目的。具体做法是实现一个next()函数,该函数包含了模式串的局部匹配信息。时间复杂度为O(m+n),其中m、n分别为模式串和主串的长度。
KMP算法中的next函数值只与模式串有关,而与相匹配的主串无关。
主串: a c a b a a b a a b c a c a a b c
模式串: a b a a b c a c
#include <iostream>
#include <string>
using namespace std;

// Number of entries in the Next table; the pattern must not be longer than this.
#define MAX_N 100

// Next[k] is the pattern index to fall back to when the character at
// pattern index k fails to match; -1 means "advance the main string instead".
int Next[MAX_N];

string s1; // main (text) string that is searched
string s2; // pattern string that is searched for
next 函数:
// Builds the basic KMP fallback table for the pattern s2 in the global Next[].
//
// After the call, Next[0] is -1 and, for k >= 1, Next[k] is the length of the
// longest proper prefix of s2[0..k-1] that is also a suffix of it.
// The table depends only on s2, never on the main string.
//
// Precondition: Next[] has room for at least s2.size() entries.
void getNext(){
    Next[0] = -1;

    // Use a signed length: s2.size() is unsigned, so `s2.size() - 1` wraps
    // around to a huge value for an empty pattern and the loop would index
    // far past the end of both s2 and Next.
    const int len = static_cast<int>(s2.size());

    int i = 0;   // index of the last pattern position whose entry is known
    int j = -1;  // length of the prefix currently being extended (-1: restart)
    while (i + 1 < len) {
        if (j == -1 || s2[i] == s2[j])
            Next[++i] = ++j;   // prefix extends by one character (or restarts at 0)
        else
            j = Next[j];       // fall back to the next shorter candidate prefix
    }
}
index 0 1 2 3 4 5 6 7
s2 a b a a b c a c
next[] -1 0 0 1 1 2 0 1
KMP函数
int KMP(){ int i = 0; //主串的位置 int j = 0; //模式串的位置 getNext(); int x = s1.size(); int y = s2.size(); while (i < x && j < y) { if (j == -1 || s1[i] == s2[j]){ i++; j++; } else j = Next[j]; } if (j == s2.size() ) return i - j; else return -1; }
返回第一次查找到的位置,如果没有找到则返回-1.
实例
acabaabaabcacaabc abaabcac next[] = -1 0 0 1 1 2 0 1 s1.size=17 s2.size=8 0 0 i acabaabaabcacaabc abaabcac j ---------------------- 1 1 i acabaabaabcacaabc abaabcac j ---------------------- 1 0 i acabaabaabcacaabc abaabcac j ---------------------- 1 -1 i acabaabaabcacaabc abaabcac j ---------------------- 2 0 i acabaabaabcacaabc abaabcac j ---------------------- 3 1 i acabaabaabcacaabc abaabcac j ---------------------- 4 2 i acabaabaabcacaabc abaabcac j ---------------------- 5 3 i acabaabaabcacaabc abaabcac j ---------------------- 6 4 i acabaabaabcacaabc abaabcac j ---------------------- 7 5 i acabaabaabcacaabc abaabcac j ---------------------- 7 2 i acabaabaabcacaabc abaabcac j ---------------------- 8 3 i acabaabaabcacaabc abaabcac j ---------------------- 9 4 i acabaabaabcacaabc abaabcac j ---------------------- 10 5 i acabaabaabcacaabc abaabcac j ---------------------- 11 6 i acabaabaabcacaabc abaabcac j ---------------------- 12 7 i acabaabaabcacaabc abaabcac j ---------------------- 13 8 i acabaabaabcacaabc abaabcac j 位置为: 5
我们还可以进一步优化:
以模式串 ABAB 为例,用上边的算法得到的next数组应该是[ -1,0,0,1 ]。
所以当 j = 3 处的 B 失配时,下一步我们应该把 j 移动到 next[3] = 1,也就是第1个元素:
不难发现,这一步是完全没有意义的。因为后面的 B 已经不匹配了,那么前面的 B 也一定不匹配,同样的情况其实还发生在第2个元素A上。
显然,发生问题的原因在于s2[j] == s2[next[j]]。
所以我们也只需要添加一个判断条件即可:
// Builds the optimized KMP fallback table for the pattern s2 in the global Next[].
//
// Same construction as the basic table, with one refinement: when the
// character at the fallback position equals the character that just failed
// (s2[i] == s2[j]), falling back there would fail again, so the entry reuses
// Next[j] and skips that useless step.
//
// Precondition: Next[] has room for at least s2.size() entries.
void getNext(){
    Next[0] = -1;

    // Use a signed length: s2.size() is unsigned, so `s2.size() - 1` wraps
    // around to a huge value for an empty pattern and the loop would index
    // far past the end of both s2 and Next.
    const int len = static_cast<int>(s2.size());

    int i = 0;   // index of the last pattern position whose entry is known
    int j = -1;  // length of the prefix currently being extended (-1: restart)
    while (i + 1 < len) {
        if (j == -1 || s2[i] == s2[j]) {
            ++i;
            ++j;
            if (s2[i] == s2[j])
                Next[i] = Next[j];  // same character would mismatch again: skip it
            else
                Next[i] = j;
        } else {
            j = Next[j];            // fall back to the next shorter candidate prefix
        }
    }
}
实例比较:
ABACBCDHI ABAB -1 0 0 1 -1 0 -1 0 9 4 9 4 0 0 0 0 i ....... ABACBCDHI ABAB j ---------------------- 1 1 i ABACBCDHI ABAB j ---------------------- 2 2 i ABACBCDHI ABAB j ---------------------- 3 3 i ABACBCDHI ABAB j ....... ---------------------- --------------------- 3 1 3 0 i i ABACBCDHI ABACBCDHI ABAB ABAB j j ---------------------- ....... 3 0 i ABACBCDHI ABAB j ----------------------
源码:
#include <cstdio>
#include <iostream>
#include <string>
using namespace std;

// Number of entries in the Next table; longer patterns are rejected in main().
#define MAX_N 100

// Next[k] is the pattern index to fall back to when the character at
// pattern index k fails to match; -1 means "advance the main string instead".
int Next[MAX_N];

string s1; // main (text) string
string s2; // pattern string

// Sample input:
//   acabaabaabcacaabc
//   abaabcac

// Builds the optimized fallback table for s2 in Next[].
// When the character at the fallback position equals the character that just
// failed, the entry reuses Next[j] so the search never retries a comparison
// that is known to fail. This is the table builder used by KMP().
void getNext(){
    Next[0] = -1;

    // Signed length: `s2.size() - 1` is unsigned and wraps for an empty
    // pattern, which would run the loop far out of bounds.
    const int len = static_cast<int>(s2.size());

    int i = 0;
    int j = -1;
    while (i + 1 < len) {
        if (j == -1 || s2[i] == s2[j]) {
            ++i;
            ++j;
            if (s2[i] == s2[j])
                Next[i] = Next[j];
            else
                Next[i] = j;
        } else {
            j = Next[j];
        }
    }
}

// Builds the basic (unoptimized) fallback table for s2 in Next[].
// Kept under its own name for comparison; the original listing defined both
// versions as getNext(), which is a redefinition error. Call this instead of
// getNext() inside KMP() to reproduce the unoptimized trace.
void getNextBasic(){
    Next[0] = -1;

    const int len = static_cast<int>(s2.size());

    int i = 0;
    int j = -1;
    while (i + 1 < len) {
        if (j == -1 || s2[i] == s2[j])
            Next[++i] = ++j;
        else
            j = Next[j];
    }
}

// Prints the current search state: both indices, a marker under position i of
// the main string, both strings, and a marker under position j of the pattern.
static void printState(int i, int j){
    cout << i << ' ' << j << endl;
    for (int k = 0; k < i; k++)
        cout << ' ';
    cout << 'i' << endl;
    cout << s1 << endl;
    cout << endl << s2 << endl;
    for (int k = 0; k < j; k++)
        cout << ' ';
    cout << 'j' << endl;
}

// Searches s1 for the first occurrence of s2, printing the Next table, both
// lengths and the state before every step.
// Returns the 0-based start index of the first match, or -1 if there is none.
int KMP(){
    int i = 0; // position in the main string
    int j = 0; // position in the pattern

    getNext();

    const int x = static_cast<int>(s1.size());
    const int y = static_cast<int>(s2.size());

    for (int k = 0; k < y; k++)
        cout << Next[k] << ' ';
    cout << endl;
    cout << x << ' ' << y << endl;

    while (i < x && j < y) {
        printState(i, j);
        cout << "----------------------" << endl;
        if (j == -1 || s1[i] == s2[j]) {
            i++;
            j++;
        } else {
            j = Next[j];
        }
    }

    // Final state, printed without the trailing separator line.
    printState(i, j);

    if (j == y)
        return i - j;
    else
        return -1;
}

// Reads the main string and the pattern from standard input, runs the search
// and prints the resulting index.
int main(){
    cin >> s1;
    cin >> s2;

    // Next[] holds only MAX_N entries; a longer pattern would overflow it.
    if (s2.size() > static_cast<string::size_type>(MAX_N)) {
        cerr << "pattern is longer than " << MAX_N << " characters" << endl;
        return 1;
    }

    cout << KMP();
    getchar();
    return 0;
}
计数(统计字符比较的次数):
// Matching loop instrumented to count character comparisons in `sum`.
// A step taken because j == -1 compares no characters, so it leaves the
// counter untouched; every other step compares s1[i] with s2[j] exactly once.
while (i < x && j < y) {
    if (j == -1) {
        // Restart marker: advance both positions without comparing anything.
        i++;
        j++;
    } else if (s1[i] == s2[j]) {
        // One comparison, and it matched.
        sum++;
        i++;
        j++;
    } else {
        // One comparison, and it failed: fall back in the pattern.
        sum++;
        j = Next[j];
    }
}