【模板】KMP字符串匹配

Selvaggia

已于 2022-03-26 12:37:57 修改

阅读量1.1k

点赞数 1

分类专栏：字符串数据结构文章标签：动态规划算法

于 2022-02-13 09:39:52 首次发布

本文链接：https://blog.csdn.net/qq_51070956/article/details/122903046

版权

数据结构同时被 2 个专栏收录

28 篇文章 0 订阅

订阅专栏

字符串

14 篇文章 0 订阅

订阅专栏

KMP字符串匹配中next数组求法

char* strstr(char*str,char*pattern);//字符串很小O(m.n)

子串里面一头一尾有可以匹配上的小串，指针不用每次都回溯到最最开头的位置。
匹配前，对模式串进行详细分析, 搞清楚next值的含义，
若 p[0 ~ j] == p[(i-j) ~ i]，即长度为 len = j+1 的前缀与以 p[i] 结尾的后缀逐字符相等（特别地 p[j] == p[i]），那么一旦 p[i+1] 与 t[?] 不相等，t[?] 下一步就要与 p[j+1] 比较，即 next[i+1] = j+1。
例如next[6]=3 ，模式串从0-5这个子串中，首尾能匹配的
小串从0开始这个小串尾部的下标再往后挪一位

其实我们也可以发现，KMP 算法之所以快，不仅仅由于它的失配处理方案，更重要的是利用前缀后缀的特性，从不会反反复复地找，我们可以看到代码里对于匹配只有一重循环，也就是说 KMP 算法具有一种“最优历史处理”的性质，而这种性质也是基于 KMP 的核心思想的。

在这里插入图片描述
t[?]与p[0]都匹配失败了，
-1代表主串里面这个点不可能成功了，主串往后挪一个位置
把模式串最前面对着失败的位置继续匹配

如果是与 $p[x]\ (x \neq 0)$ 比较失败了，那么 t[?] 接下来与 $p[next[x]]$ 比较

这种形式可以成功获得匹配的位置，~~但是有一个缺陷，next数组的不能正确反映模式串中首尾匹配的小串的真正长度~~
没有缺陷，根据观察比对，发现 $next[i]$（$0 \le i \le p.size()-1$）表示了模式串前 i 个字符，即 0 ~ i-1 这一段的首尾匹配小串的长度，可是 0 ~ p.size()-1 整个模式串这一段的首尾匹配小串长度却不可知。观察 getNext() 函数里对 next 数组的求值就可以发现，next 数组的长度是 $p.size()+1$ 而不是 $p.size()$，也就是存在 $next[p.size()]$，可以表示 0 ~ p.size()-1 整个模式串首尾匹配小串长度。
在这里插入图片描述

【模板】KMP字符串匹配

【模板】KMP字符串匹配

#include <iostream>
#include <algorithm>
using namespace std;
const int N=1000005;   // exact integer literal; same value as the former 1e6+5
int next[N];           // failure table shared by getNext() and kmp()

/**
 * Builds the KMP failure table for the 0-indexed pattern `p` into the global `next`.
 *
 * On return next[0] == -1 and, for 1 <= i <= p.size(), next[i] is the length of the
 * longest proper prefix of p[0..i-1] that is also a suffix of it. Equivalently, it is
 * the pattern index to retry when p[i] mismatches the text. The table therefore has
 * p.size()+1 meaningful entries, so p.size() must be smaller than N.
 *
 * The array is always referred to as `::next`: with `using namespace std;` active the
 * bare name is ambiguous with std::next on C++11 and later compilers.
 */
void getNext(const std::string& p){
	const int len=static_cast<int>(p.size());
	::next[0]=-1;
	int i=0;    // last character of the suffix currently being extended
	int j=-1;   // last character of the matching prefix; -1 means "restart from p[0]"
	while(i<len){
		if(j==-1||p[i]==p[j]){
			// Prefix and suffix still agree (or we restarted): both grow by one,
			// so a mismatch at the new i resumes the comparison at the new j.
			++i;
			++j;
			::next[i]=j;
		}
		else{
			// p[i] != p[j]: fall back to the next shorter border and retry.
			j=::next[j];
		}
	}
}
void kmp(string t,string p){
	int lt=t.size();
	int lp=p.size();
	int i=0;
	int j=0;
	while(i<lt&&j<lp){
		if(j==-1||t[i]==p[j]){
			i++;
			j++;
		}
		else j=next[j];//这样j就可能为-1，继而想到if中要判断j==-1 
		//j==-1模式串从0开始即j++，t[i]正是不能与p[0]匹配 
		if(j==(lp)){
		cout<<i-lp+1<<endl;
		j=next[j];//回溯到p【0】
	}	
}	
} 
int main(){
	string t;
	string p;
	cin>>t>>p;
	getNext(p); 
 	kmp(t,p);
 	for(int i=1;i<=p.size();i++){
	 //next数组长度有p.size()+1,
//	 后p.size()个值才是模式串里首尾匹配小串的长度 
 		cout<<next[i]<<" ";
	 }

    return 0;
}

还有种写法，避免next值取-1的

#include<iostream>
#include<cstring>
#define MAXN 1000010
using namespace std;
int next[MAXN];
int la,lb,j; 
char a[MAXN],b[MAXN];
int main()
{
    cin>>a+1;
    cin>>b+1;
    la=strlen(a+1);
    lb=strlen(b+1);
    for (int i=2;i<=lb;i++)
	   {     
	   while(j&&b[i]!=b[j+1])
        j=next[j];    
       if(b[j+1]==b[i])j++;    
        next[i]=j;
       }
    j=0;
    for(int i=1;i<=la;i++)
	   {
          while(j>0&&b[j+1]!=a[i])
           j=next[j];
          if (b[j+1]==a[i]) 
           j++;
          if (j==lb) {cout<<i-lb+1<<endl;j=next[j];}
       }

    for (int i=1;i<=lb;i++)
    cout<<next[i]<<" ";
    return 0;
}

错乱修正后总结

KMP模板一

#include<iostream>
using namespace std;
//#define N 1e6+5
const int N=1000005;   // exact integer literal; same value as the former 1e6+5
int next[N];           // failure table, p.size()+1 meaningful entries

/**
 * Template one: failure table for a 0-indexed pattern.
 *
 * Sets next[0] = -1 and, for 1 <= i <= p.size(), next[i] = length of the longest
 * proper border of p[0..i-1]. Requires p.size() < N.
 * The array is spelled `::next` because the unqualified name is ambiguous with
 * std::next once `using namespace std;` is in effect (C++11 and later).
 */
void getNext(const std::string& p){
	const int len=static_cast<int>(p.size());
	int i=0;
	int j=-1;
	::next[0]=-1;
	while(i<len){
		if(j==-1||p[i]==p[j]){
			::next[++i]=++j;   // i.e. next[i+1]=j+1

			/* Optional "nextval" optimisation, intentionally left disabled:
			 *     i++; j++;
			 *     if(p[i]==p[j]) next[i]=next[j];
			 *     else           next[i]=j;
			 * next[i] is the pattern index tried after p[i] mismatches the text.
			 * When p[i]==p[j], retrying with p[j] is bound to fail as well, so one
			 * can jump straight to next[j].
			 * Use it only when match positions are all that is needed: the table then
			 * no longer holds border lengths (entries that should be 0 become -1).
			 */
		}
		else j=::next[j];
	}
}
void kmp(string t,string p){
	int lt=t.size();
	int lp=p.size();
	int i=0;
	int j=0;
	while(i<lt){
		if(j==-1||t[i]==p[j]){
			i++;
			j++;
		}
		else j=next[j];
	//由这一步知晓j随时可能为-1，应将主串右移一位与p[0]比较 
	if(j==lp){//注意不要写出while循环 
		cout<<i-lp+1<<endl;
		j=next[j];//由这里也可以看见next数组长度是lp+1，next[lp]表示
//		整个模式串首尾小串的长度 
	}
	}
} 
int main(){
	string t;
	string p;
	cin>>t>>p;
	getNext(p); 
 	kmp(t,p);
 	for(int i=1;i<=p.size();i++){
	 //next数组长度有p.size()+1,
	//后p.size()个值才是模式串里首尾匹配小串的长度 
 		cout<<next[i]<<" ";
	 }
    return 0;
}

模板一：
读入字符串从0开始
next[i]数组存放当i不匹配时，下一次比较从next[i]开始。而数组从0开始，0就代表第一个数，1代表第二个数。

p[i]匹配不上t[?]时,让p[next[i]]去匹配t[?]

模板二：
读入字符串从1开始
并且next【i】存放以i为结尾的后缀与从头开始的前缀最多有多少个字符相匹配，
0代表无，1代表一个。
p[i+1]配不上t[?]时，用p[next[i]+1]去匹配t[?]
因为p[i]==p[next[i]],对准相同段的最后一个字符x，x后一个字前赴后继

模板一和模板二的next数组都可以用于处理循环字符串问题，模板一、二中next数组都可以准确存放匹配数量，next[i]都是表示前面长度为i的子串中，前缀和后缀相等的最大长度。

虽然含义不同，模板二next【i】直接表示模式串中1~i这一段的首尾匹配小串长度，next[i]+1去比较适配位置要匹配的字符，所以以j为匹配段的下标，关心的永远是j+1

模板一，next【i】表示模式串中0~i-1这一段的首尾匹配小串长度，next【i】就是适配位置要匹配的字符

KMP模板二

#include<iostream>
#include <string.h>
using namespace std;
//#define N 1e6+5
const int N=1000005;   // exact integer literal; same value as the former 1e6+5
int next[N];           // next[i]: longest proper border of p[1..i]
char t[N];             // text, stored from index 1
char p[N];             // pattern, stored from index 1

/**
 * Template two: failure table for the 1-indexed pattern held in the global `p`.
 *
 * For 1 <= i <= strlen(p+1), next[i] becomes the length of the longest proper prefix
 * of p[1..i] that is also its suffix. Values are never negative.
 * next[0] and next[1] are written explicitly, so the function does not depend on the
 * array still being zero-initialised. `::next` avoids the clash with std::next.
 */
void getNext(){
	const int len=static_cast<int>(strlen(p+1));
	int j=0;                 // border length of the prefix processed so far
	::next[0]=0;
	::next[1]=0;             // a single character has no proper border
	for(int i=2;i<=len;i++){
		while(j>0&&p[i]!=p[j+1])j=::next[j];   // shrink the border until it can be extended
		if(p[i]==p[j+1])j++;
		::next[i]=j;
	}
}
void kmp(){
	int lt=strlen(t+1);
	int lp=strlen(p+1);	
	int j=0;
	for(int i=1;i<=lt;i++){
		while(j&&t[i]!=p[j+1]){
			j=next[j];
		} 
		if(t[i]==p[j+1]){
			j++;
		}
		if(j==lp){
			cout<<i-lp+1<<endl;
			j=next[j];
		}
	}
} 
int main(){
	cin>>t+1;
	cin>>p+1;
	getNext(); 
 	kmp();
 	for(int i=1;i<=strlen(p+1);i++){
	 //next数组长度有p.size()+1,
	//后p.size()个值才是模式串里首尾匹配小串的长度 
 		cout<<next[i]<<" ";
	 }
    return 0;
}

KMP最小循环节、循环周期：

定理：假设S的长度为len，则S存在最小循环节，最小循环节的长度L为 $len - next[len]$，子串为 $S[0 \dots (len - next[len] - 1)]$。

（1）如果 $len$ 可以被 $len - next[len]$ 整除，则表明字符串S可以完全由循环节循环组成，循环周期T=len/L。

（2）如果不能，说明还需要再添加几个字母才能补全。需要补的个数是循环个数L-len%L=L-(len-L)%L=L-next[len]%L，L=len-next[len]。
在这里插入图片描述
以上图片来自该博文，感谢博主

对于一个字符串，如abcd abcd abcd，由长度为4的字符串abcd重复3次得到，那么必然有原字符串的前八位等于后八位。

也就是说，对于某个字符串S，长度为len，由长度为L的字符串s重复R次得到，当R≥2时必然有S[0…len-L-1]=S[L…len-1]，字符串下标从0开始

那么对于KMP算法来说，就有next[len]=len-L。此时L肯定已经是最小的了（因为next的值是前缀和后缀相等的最大长度，即len-L是最大的，那么在len已经确定的情况下，L是最小的）

POJ1961 Period

Description

For each prefix of a given string S with N characters (each character has an ASCII code between 97 and 126, inclusive), we want to know whether the prefix is a periodic string. That is, for each i (2 <= i <= N) we want to know the largest K > 1 (if there is one) such that the prefix of S with length i can be written as $A^K$ ,that is A concatenated K times, for some string A. Of course, we also want to know the period K.
Input

The input consists of several test cases. Each test case consists of two lines. The first one contains N (2 <= N <= 1 000 000) – the size of the string S.The second line contains the string S. The input file ends with a line, having the
number zero on it.
Output

For each test case, output “Test case #” and the consecutive test case number on a single line; then, for each prefix with length i that has a period K > 1, output the prefix size i and the period K separated by a single space; the prefix sizes must be in increasing order. Print a blank line after each test case.
Sample Input

3
aaa
12
aabaabaabaab
0
Sample Output

Test case #1
2 2
3 3

Test case #2
2 2
6 2
9 3
12 4

#include<iostream>
#include <string>
using namespace std;
//#define N 1e6+5
const int N=1000005;   // exact integer literal; same value as the former 1e6+5
int next[N];           // next[i]: longest proper border of the first i characters

/**
 * Failure table for the 0-indexed string `p` (POJ 1961, template one).
 *
 * next[0] = -1; for 1 <= i <= p.size(), next[i] is the length of the longest proper
 * prefix of p[0..i-1] that is also its suffix, so i - next[i] is the smallest period
 * of that prefix. Requires p.size() < N.
 * The string is taken by const reference to avoid a copy per test case, and the
 * array is written `::next` because the bare name is ambiguous with std::next.
 */
void getNext(const std::string& p){
	const int len=static_cast<int>(p.size());
	int i=0;
	int j=-1;
	::next[0]=-1;
	while(i<len){
		if(j==-1||p[i]==p[j]){
			i++;
			j++;
			::next[i]=j;
		}
		else j=::next[j];   // fall back to the next shorter border
	}
}
int main(){
	int len;
	string p;
	int cnt=0;
	while(cin>>len&&len){
		cin>>p;
	//len/(len-next[len])
	getNext(p);
	cout<<"Test case #"<<(++cnt)<<endl;
	for(int i=2;i<=len;i++){
		int l=i-next[i];
		if(i%l==0&&i/l!=1){
			cout<<i<<" "<<i/l<<endl;
		}
	}
	cout<<endl;
	}
	
    return 0;
}

#include<iostream>
#include <string>
using namespace std;
//#define N 1e6+5
const int N=1000005;   // exact integer literal; same value as the former 1e6+5
int next[N];           // next[i]: longest proper border of p[1..i]
char p[N];             // current string, stored from index 1
int len;               // its length

/**
 * Failure table (template two) for the 1-indexed global string p[1..len].
 * For 2 <= i <= len, next[i] is the length of the longest proper prefix of p[1..i]
 * that is also its suffix; next[1] is 0.
 * `::next` is used because the bare name is ambiguous with std::next when
 * `using namespace std;` is active.
 */
void getNext(){
	int j=0;
	::next[1]=0;
	for(int i=2;i<=len;i++){
		while(j&&p[j+1]!=p[i]){
			j=::next[j];
		}
		if(p[j+1]==p[i]){
			j++;
		}
		::next[i]=j;   // border length of the first i characters
	}
}
int main(){
	int cnt=0;
	while(cin>>len&&len){
	cin>>p+1;
	//len/(len-next[len])
	getNext();
	cout<<"Test case #"<<(++cnt)<<endl;
	for(int i=2;i<=len;i++){
		int l=i-next[i];
		if(i%l==0&&i/l!=1){
			cout<<i<<" "<<i/l<<endl;
		}
	}
	cout<<endl;
	}
	
    return 0;
}

可见按照模板一二时由next数组获得最小循环节长度的操作是一样的

// Identical in both templates: i - next[i] is the smallest period of the first i
// characters, and the prefix is a proper power exactly when that period divides i
// more than once. `::next` avoids the clash with std::next under `using namespace std;`.
for(int i=2;i<=len;i++){
	const int l=i-::next[i];
	if(i%l==0&&i/l!=1){
		std::cout<<i<<" "<<i/l<<'\n';
	}
}

Power Strings（不能说和上一题很像，那就是一模一样）

添加链接描述
Power Strings
Time Limit: 3000MS Memory Limit: 65536K
Total Submissions: 77212 Accepted: 31818
Description

Given two strings a and b we define ab to be their concatenation. For example, if a = “abc” and b = “def” then ab = “abcdef”. If we think of concatenation as multiplication, exponentiation by a non-negative integer is defined in the normal way: a ^ 0 = “” (the empty string) and a ^ (n+1) = a*(a^n).
Input

Each test case is a line of input representing s, a string of printable characters. The length of s will be at least 1 and will not exceed 1 million characters. A line containing a period follows the last test case.
Output

For each s you should print the largest n such that s = a^n for some string a.
Sample Input

abcd
aaaa
ababab
.
Sample Output

1
4
3
Hint

This problem has huge input, use scanf instead of cin to avoid time limit exceed.

#include<iostream>
#include <string.h>
using namespace std;
//#define N 1e6+5
const int N=1000005;   // exact integer literal; same value as the former 1e6+5
int next[N];           // next[i]: longest proper border of p[1..i]
char p[N];             // current string, stored from index 1
int len;               // its length

/**
 * Computes the border table of the 1-indexed global string p[1..len]
 * (POJ 2406 "Power Strings").
 * After the call, len - next[len] is the smallest period of the whole string.
 * The array is written `::next` since the unqualified name collides with std::next
 * on C++11 and later compilers.
 */
void getNext(){
	int j=0;       // border length of the prefix handled so far
	::next[1]=0;
	for(int i=2;i<=len;i++){
		while(j&&p[j+1]!=p[i]){
			j=::next[j];   // try the next shorter border
		}
		if(p[j+1]==p[i]){
			j++;
		}
		::next[i]=j;   // border length of the first i characters
	}
}
int main(){
	ios::sync_with_stdio(false);
	cin.tie(0);
	int cnt=0;
	while(cin>>p+1){
	if(p[1]=='.') break;
	len=strlen(p+1); 
	//len/(len-next[len])
	getNext();
//	int maxn=-1;
//	for(int i=2;i<=len;i++){
//		int l=i-next[i];
//		if(i%l==0){
//			maxn=max(maxn,i/l);
//		}
//	}
	int n=1;
	if(len%(len-next[len])==0)n=len/(len-next[len]);
	cout<<n<<endl;
	}
    return 0;
}

最小循环节用字符串哈希解法

发现char中的字母有256个，要找大于256的质数，犯难了，索性编个线性筛，杀鸡用牛刀。
如何检验线性筛的正确性，请注意，2-100这个区间的质数有25个。

找到一个质数257.

#include <cstdio>
#include <cstring>
#define maxn 1000010
#define ULL unsigned long long
#define bas 257
using namespace std;
char a[maxn];          // current string, stored from index 1
ULL h[maxn],p[maxn];   // h[i]: hash of a[1..i];  p[i]: bas^i (both wrap modulo 2^64)

/**
 * POJ 2406 solved with polynomial string hashing instead of KMP.
 * For each string, tries every divisor `le` of its length in increasing order and
 * prints len/le for the first one where all consecutive blocks of length `le` hash
 * to the same value as the first block. Hash equality is taken as string equality;
 * collisions are not re-checked.
 */
int main(){
	while(true){
		// Stop at end of input as well as at the "." line. Ignoring scanf's result
		// would loop forever on EOF. The width keeps the read inside the buffer:
		// 1000008 = maxn-2, since a+1 has maxn-1 bytes including the terminator.
		if(scanf("%1000008s",a+1)!=1)break;
		// Only an exact "." terminates; strings that merely begin with '.' are data.
		if(a[1]=='.'&&a[2]=='\0')break;
		int len=strlen(a+1);
		p[0]=1;
		for(int i=1;i<=len;i++)p[i]=p[i-1]*bas;        // powers of the base
		h[0]=0;
		for(int i=1;i<=len;i++)h[i]=h[i-1]*bas+a[i];   // prefix hashes
		for(int le=1;le<=len;le++){
			if(len%le!=0)continue;   // a period of the whole string must divide len
			// Walk block by block (j jumps by le). hash(a[j+1..j+le]) is
			// h[j+le]-h[j]*p[le], i.e. h[r]-h[l-1]*p[r-l+1] for the range [l,r].
			int j=0;
			while(j+le<=len&&h[j+le]-h[j]*p[le]==h[le])j+=le;
			if(j==len){   // every block matched the first one
				printf("%d\n",len/le);
				break;
			}
		}
	}
	return 0;
}

KMP&字符串哈希模板

http://poj.org/problem?id=3461
这道题跟POJ2406很相似，Power Strings那道题是求abcabcabc，则就是3次，最小循环节
而这道题，abababa中aba出现了3次，kmp的裸题

Seek the Name, Seek the Fame

http://poj.org/problem?id=2752

#include<iostream>
#include <stack> 
#include <string.h>
using namespace std;
//#define N 1e6+5
const int N=1000005;   // exact integer literal; same value as the former 1e6+5
int next[N];           // next[i]: longest proper border of p[1..i]
char p[N];             // current string, stored from index 1
int len;               // its length

/**
 * Border table of the 1-indexed global string p[1..len] (POJ 2752).
 * Following next[len], next[next[len]], ... down to 0 enumerates every length at
 * which a prefix of the string equals a suffix of it, from longest to shortest.
 * `::next` is spelled out because the plain name is ambiguous with std::next when
 * `using namespace std;` is in effect.
 */
void getNext(){
	int j=0;
	::next[1]=0;
	for(int i=2;i<=len;i++){
		while(j&&p[j+1]!=p[i]){
			j=::next[j];
		}
		if(p[j+1]==p[i]){
			j++;
		}
		::next[i]=j;   // border length of the first i characters
	}
}
stack<int> st;
int main(){
	while(cin>>p+1){
		len=strlen(p+1); 
	getNext();
	st.push(len);
	while(next[len]){
		st.push(next[len]);
		len=next[len];
	}
	while(!st.empty()){
		cout<<st.top()<<" ";
		st.pop();
	}
	cout<<endl;
	}
	
    return 0;
}