KMP学习笔记

最新推荐文章于 2022-05-11 21:59:10 发布

Bil!

最新推荐文章于 2022-05-11 21:59:10 发布

阅读量179

点赞数 1

分类专栏：左神笔记

本文链接：https://blog.csdn.net/weixin_45045689/article/details/107431278

版权

左神笔记专栏收录该内容

11 篇文章 1 订阅

订阅专栏

KMP算法是一种字符串匹配算法，可以在 O(n+m) 的时间复杂度内实现两个字符串的匹配。

我觉得KMP关键就两点：1.实现在匹配的过程中，主串不回退。 2.用最长前缀等于最长后缀的性质快速求出next[]数组。

我们先看关键点一：为什么要实现，在匹配的过程中，主串不回退？？

用暴力来解决洛谷3375 【模板】KMP字符串匹配这题试试。

题目描述

给出两个字符串 s1 和 s2，若 s1 的区间 [l, r] 子串与 s2 完全相同，则称 s2 在 s1 中出现了，其出现位置为 l。
现在请你求出 s2 在 s1 中所有出现的位置。

定义一个字符串 s 的 border 为 s 的一个非 s 本身的子串 t，满足 t 既是 s 的前缀，又是 s 的后缀。
对于 s2，你还需要求出对于其每个前缀 s′ 的最长 border t′ 的长度。

输入样例：

ABABABC
ABA

输出样例：

1
3
0 0 1

用暴力解法就没有next数组这一说了。。

#include<string>
#include<iostream>
using namespace std;

// Naive matcher: slides P across S one start position at a time and prints
// the 1-based start index of every window of S that equals P, one index
// per line. Worst case it performs |S| * |P| character comparisons.
void bruteForce(string S, string P)
{
	const int textLen = S.length();
	const int patLen = P.length();

	// The last window that still fits into S starts at textLen - patLen.
	for (int start = 0; start + patLen <= textLen; ++start){
		// Count how many leading characters of the window agree with P.
		int same = 0;
		while (same < patLen && S[start + same] == P[same]){
			++same;
		}

		if (same == patLen){
			cout << start + 1 << endl;
		}
	}
}


// Program entry: reads the text and the pattern as two whitespace-separated
// tokens from standard input and hands them to bruteForce.
int main()
{
	string text;
	string pattern;
	cin >> text;
	cin >> pattern;

	bruteForce(text, pattern);
	return 0;
}

时间复杂度为O(N * M) ，太慢了。

我们用next数组跳过那些一定不成功的比较。那什么是next数组呢？？(next数组的定义在各教材中有所区别。)

定义一：

　next数组是对于模式串而言的。P 的 next 数组定义为：next[i] 表示 P[0] ~ P[i] 这一个子串，使得 前k个字符恰等于后k个字符 的最大的k. 特别地，k不能取i+1（因为这个子串一共才 i+1 个字符，自己肯定与自己相等，就没有意义了）。

定义二：

next 数组相当于 最大长度值 整体向右移动一位，然后初始值赋值为 -1。 (这个 最大长度值 指定义一中的 next数组)

因为next数组的定义不同，所以导致用next数组进行匹配的代码有点不同。很明显，该题要我们输出定义一中的next数组，不过，我更偏向用定义二中的next数组。

这张图很好的解释了用next数组快速匹配的核心：

图片来源：https://www.zhihu.com/question/21923021

#include<string>
#include<iostream>
using namespace std;
const int maxn = 1e6 + 5;

// Builds the shifted failure table ("definition 2") of P by brute force.
// On return next[0] == -1 and, for 1 <= k <= P.length(), next[k] is the
// length of the longest proper prefix of P[0..k-1] that is also a suffix
// of it.
// The table lives in a function-local static buffer of maxn ints: every
// call overwrites it, and P.length() has to stay below maxn.
int* getNextArray(string P)
{
	static int next[maxn];
	const int patLen = P.length();

	next[0] = -1;
	if (patLen >= 1){
		next[1] = 0;  // a single character has no proper border
	}

	// The border length of the prefix ending at index `last` is stored one
	// slot to the right, at next[last + 1].
	for (int last = 1; last < patLen; ++last){
		// Try the longest candidate first; length 0 always matches, so the
		// scan is guaranteed to stop.
		int border = last;
		while (P.compare(0, border, P, last - border + 1, border) != 0){
			--border;
		}
		next[last + 1] = border;
	}

	return next;
}


// Prints every 1-based position at which P occurs in S (overlapping
// occurrences included), one per line, and then a final line with
// next[1] ... next[PLen], each value followed by a space.
//
// `next` is the table returned by getNextArray(P). The matcher relies on
// next[0] == -1 and on next[k] (1 <= k <= PLen) being the length of the
// longest border of the first k characters of P.
void KMP(const string& S, const string& P)
{
	const int SLen = S.length(), PLen = P.length();

	if (PLen == 0){
		// An empty pattern has no prefixes to report and would make the
		// match test below fire on every iteration; emit only the (empty)
		// table line.
		cout << '\n';
		return;
	}

	int *next = getNextArray(P);  // shifted failure table, see above

	int i1 = 0;  // next unread position in S
	int i2 = 0;  // number of pattern characters currently matched
	while (i1 < SLen){
		if (S[i1] == P[i2]){
			++i1;
			++i2;
		}
		else if (i2 > 0){
			// Mismatch after a partial match: keep i1 and retry with the
			// longest border of the matched prefix.
			i2 = next[i2];
		}
		else{
			// Mismatch on the first pattern character: advance the text.
			++i1;
		}

		if (i2 == PLen){
			cout << i1 - PLen + 1 << '\n';
			// Resume from the longest border of the whole pattern so that
			// overlapping occurrences are still reported. Stepping i1 back
			// and reading next[PLen - 1] instead yields next[0] == -1 for a
			// one-character pattern and sends i2 out of range.
			i2 = next[PLen];
		}
	}

	for (int i = 1; i <= PLen; i++){
		cout << next[i] << " ";
	}
	cout << '\n';
}


// Entry point: reads the text and then the pattern from standard input and
// lets KMP print the match positions and the border table.
int main()
{
	string text;
	string pattern;
	cin >> text;
	cin >> pattern;

	KMP(text, pattern);
	return 0;
}

这样求next数组，要枚举 i 和 j 共 O(m * m) 次，而每次子串比较最坏还要 O(m)，所以最坏时间复杂度可达 O(m * m * m)，太慢了。

我们来到了 KMP 的要点2 ：用最长前缀等于最长后缀的性质快速求出next[]数组。

毕竟是由数据状况和要解决的问题出发思考算法嘛。用两个例子来讲清楚怎么实现吧！

比如我想得到下标为 10 的next数组的值，即在蓝色括号内找最长前缀和最长后缀。可以知道下标为 9 的next数组的值为 4（记 cn = next[i - 1]，这里 i = 10，cn = 4）。如果在模式串中 i - 1 位置上的字符与 cn 位置上的字符相等，说明可以在 i - 1 得到的最长前缀和最长后缀的基础上各追加一个字符。在这个例子中，就是从原来的 abab 变为 ababc 。所以 next[i] 的值为 next[i - 1] + 1。

如果在模式串中 i - 1 位置上的字符与 cn 位置上的字符不相等呢？？

我们就看next[cn] 上的值，并把这个值赋值给cn。比如在下面这个例子中，在模式串中 i - 1 和 cn 的值不相等。而 next[cn] == 2 。

再看模式串下标为 cn 的字符与下标为 i - 1 的字符是否相等。如果相等，那么 next[i] = cn + 1（长度为 cn 的前缀和后缀各再延长一个字符）。

如果还不相等，那么继续进行这个操作。如果cn 为 0 了，模式串下标为 cn ，与下标为 i -1 上的值还不相等，那么next[i] = 0.

总结：

cn（初始为 next[i - 1]）是 P[0..i-2] 的最长前缀的后一个位置，i - 1 是对应的最长后缀的后一个位置；比较这两个位置上的字符，相等则 next[i] = cn + 1，不相等则令 cn = next[cn] 继续比较，直到 cn 为 0 仍不相等时 next[i] = 0。

#include<string>
#include<iostream>
using namespace std;
const int maxn = 1e6 + 5;

// Computes the shifted failure table of P with the linear-time border
// recurrence. next[0] is -1; for 2 <= k <= P.length(), next[k] is the
// length of the longest proper prefix of P[0..k-1] that is also a suffix
// of it. next[1] is written as 0, except when P is exactly one character
// long, in which case the function returns before touching it.
// The result points into a static buffer of maxn ints that every call
// reuses, so P.length() has to stay below maxn. next[P.length()] is not
// needed for mismatch fallback but is part of the table KMP prints.
int* getNextArray(string P)
{
	static int next[maxn];
	const int patLen = P.length();

	next[0] = -1;
	if (patLen == 1){
		return next;
	}
	next[1] = 0;

	// `border` carries the border length of the previously handled prefix,
	// i.e. the index of the character that would extend that border.
	int border = 0;
	for (int filled = 2; filled <= patLen; ++filled){
		const char tail = P[filled - 1];

		// Shrink to shorter borders until `tail` extends one of them or
		// nothing is left to shrink.
		while (border > 0 && tail != P[border]){
			border = next[border];
		}
		if (tail == P[border]){
			++border;
		}
		next[filled] = border;
	}

	return next;
}


// Prints every 1-based position at which P occurs in S (overlapping
// occurrences included), one per line, and then a final line with
// next[1] ... next[PLen], each value followed by a space.
//
// `next` is the table returned by getNextArray(P). The matcher relies on
// next[0] == -1 and on next[k] (1 <= k <= PLen) being the length of the
// longest border of the first k characters of P.
void KMP(const string& S, const string& P)
{
	const int SLen = S.length(), PLen = P.length();

	if (PLen == 0){
		// An empty pattern has no prefixes to report and would make the
		// match test below fire on every iteration; emit only the (empty)
		// table line.
		cout << '\n';
		return;
	}

	int *next = getNextArray(P);  // shifted failure table, see above

	int i1 = 0;  // next unread position in S
	int i2 = 0;  // number of pattern characters currently matched
	while (i1 < SLen){
		if (S[i1] == P[i2]){
			++i1;
			++i2;
		}
		else if (i2 > 0){
			// Mismatch after a partial match: keep i1 and retry with the
			// longest border of the matched prefix.
			i2 = next[i2];
		}
		else{
			// Mismatch on the first pattern character: advance the text.
			++i1;
		}

		if (i2 == PLen){
			cout << i1 - PLen + 1 << '\n';
			// Resume from the longest border of the whole pattern so that
			// overlapping occurrences are still reported. Stepping i1 back
			// and reading next[PLen - 1] instead yields next[0] == -1 for a
			// one-character pattern and sends i2 out of range.
			i2 = next[PLen];
		}
	}

	for (int i = 1; i <= PLen; i++){
		cout << next[i] << " ";
	}
	cout << '\n';
}


// Entry point: reads the text and then the pattern from standard input and
// lets KMP print the match positions and the border table.
int main()
{
	//freopen("D:\\in.txt","r",stdin);  // local input redirection, left disabled
	string text;
	string pattern;
	cin >> text;
	cin >> pattern;

	KMP(text, pattern);
	return 0;
}

再来几道题加深印象：

剪花布条

HDU - 2087

题意：给定主串和模式串，统计主串中最多能切出多少个互不重叠的模式串。模板题。

#pragma GCC optimize(3,"Ofast","inline")
#include<iostream>
#include<cstdio>
#include<algorithm>
#include<string>
#include<vector>
#include<cstring>
#include<queue>
#include<stack>
#include<list>
#include<map>
#include<set>
#include<cmath>
#include<sstream>
#include<cstdlib>
#include<bitset>
#include<climits>
// Convenience macros from the author's contest template; none of them is
// referenced by the code visible in this program.
// F(i,s,t): loop header counting int i upward from s to t inclusive.
#define F(i,s,t) for(int i=(s);i<=(t);i++)
// D(i,s,t): loop header counting int i downward from s to t inclusive.
#define D(i,s,t) for(int i=(s);i>=(t);i--)
// dBug / ddBug: print one or two int values prefixed with "Value=".
#define dBug(i) printf("Value=%d\n",i)
#define ddBug(i,j) printf("Value=%d %d\n",i,j)
// ed: write a single newline character to stdout.
#define ed putchar('\n')
// FO: reopen stdin on the hard-coded local input file in.txt on drive D.
#define FO freopen("D:\\in.txt","r",stdin)
// IOS: calls cin.tie(0), cout.tie(0) and sync_with_stdio(0) in one expression.
#define IOS cin.tie(0) ,cout.tie(0), cout.sync_with_stdio(0)
// ll: shorthand for long long.
typedef long long ll;
//const int INF = 1 << 30;
//const double EPS = 1e-6;
//#define MX 102
//#define Mod 10000
using namespace std;
const int maxn = 1e6 + 5;

// Fills and returns the shifted failure table of P: next[0] is -1 and, for
// 2 <= k <= P.length(), next[k] is the length of the longest proper prefix
// of the first k characters of P that is also their suffix. next[1] is set
// to 0 unless P has exactly one character, where the function returns
// early.
// The table is a static buffer of maxn ints shared by all calls, so each
// call overwrites the previous result and P.length() must be below maxn.
int* getNextArray(string P)
{
	static int next[maxn];
	const int patLen = P.length();

	next[0] = -1;
	if (patLen == 1){
		return next;
	}
	next[1] = 0;

	// `border` is the border length found for the previous prefix.
	int border = 0;
	for (int k = 2; k <= patLen; ++k){
		const char added = P[k - 1];

		// Fall back through shorter borders until `added` extends one.
		while (border > 0 && added != P[border]){
			border = next[border];
		}
		if (added == P[border]){
			++border;
		}
		next[k] = border;
	}

	return next;
}

// Counts occurrences of P in S while scanning left to right. After every
// complete match the matcher restarts from the beginning of P, so counted
// occurrences never share characters. Returns that count.
int KMP(string S, string P)
{
	const int textLen = S.length();
	const int patLen = P.length();
	int *fail = getNextArray(P);  // shifted failure table, fail[0] == -1

	int pieces = 0;   // complete matches found so far
	int matched = 0;  // pattern characters matched at the current alignment
	for (int pos = 0; pos < textLen; ){
		if (S[pos] == P[matched]){
			++pos;
			++matched;
		}
		else if (matched != 0){
			// Partial match failed: retry this text character against the
			// border of the matched prefix.
			matched = fail[matched];
		}
		else{
			// Nothing matched (fail[0] is always -1 here): skip the text
			// character.
			++pos;
		}

		if (matched == patLen){
			++pieces;
			matched = 0;
		}
	}

	return pieces;
}


// Reads test cases until end of input or a line whose first token is "#".
// Each case is a text token followed by a pattern token; the count returned
// by KMP is printed on its own line.
int main()
{
	string text, pattern;
	for (;;){
		if (!(cin >> text) || text == "#"){
			break;
		}
		cin >> pattern;

		cout << KMP(text, pattern) << endl;

		// Start the next case from empty strings.
		text.clear();
		pattern.clear();
	}
	return 0;
}

Simpsons’ Hidden Talents

HDU - 2594

题意：

给出两个串，求最长的字符串，使它既是第一个串的前缀，又是第二个串的后缀；输出这个字符串和它的长度，不存在则输出 0。

题解:

把这两个串连接起来，再求它的next数组，末尾的 next 值就是候选答案。但答案不能超过两个串中任意一个的长度，超过时就沿 next 数组继续回退，直到不超过为止。

#pragma GCC optimize(3,"Ofast","inline")
#include<iostream>
#include<cstdio>
#include<algorithm>
#include<string>
#include<vector>
#include<cstring>
#include<queue>
#include<stack>
#include<list>
#include<map>
#include<set>
#include<cmath>
#include<sstream>
#include<cstdlib>
#include<bitset>
#include<climits>
// Convenience macros from the author's contest template; none of them is
// referenced by the code visible in this program.
// F(i,s,t): loop header counting int i upward from s to t inclusive.
#define F(i,s,t) for(int i=(s);i<=(t);i++)
// D(i,s,t): loop header counting int i downward from s to t inclusive.
#define D(i,s,t) for(int i=(s);i>=(t);i--)
// dBug / ddBug: print one or two int values prefixed with "Value=".
#define dBug(i) printf("Value=%d\n",i)
#define ddBug(i,j) printf("Value=%d %d\n",i,j)
// ed: write a single newline character to stdout.
#define ed putchar('\n')
// FO: reopen stdin on the hard-coded local input file in.txt on drive D.
#define FO freopen("D:\\in.txt","r",stdin)
// IOS: calls cin.tie(0), cout.tie(0) and sync_with_stdio(0) in one expression.
#define IOS cin.tie(0) ,cout.tie(0), cout.sync_with_stdio(0)
// ll: shorthand for long long.
typedef long long ll;
//const int INF = 1 << 30;
//const double EPS = 1e-6;
//#define MX 102
//#define Mod 10000
using namespace std;
const int maxn = 5e4 + 5;

// Fills and returns the shifted failure table of the NUL-terminated string
// P: next[0] is -1 and, for 2 <= k <= strlen(P), next[k] is the length of
// the longest proper prefix of the first k characters that is also their
// suffix. next[1] is set to 0 unless P has exactly one character, where
// the function returns early.
// The table is a static buffer of maxn * 2 ints shared by all calls, so
// strlen(P) must stay below maxn * 2.
int* getNextArray(char* P)
{
	static int next[maxn * 2];
	const int patLen = strlen(P);

	next[0] = -1;
	if (patLen == 1){
		return next;
	}
	next[1] = 0;

	// `border` is the border length found for the previous prefix.
	int border = 0;
	for (int k = 2; k <= patLen; ++k){
		const char added = P[k - 1];

		// Fall back through shorter borders until `added` extends one.
		while (border > 0 && added != P[border]){
			border = next[border];
		}
		if (added == P[border]){
			++border;
		}
		next[k] = border;
	}

	return next;
}

int main()
{
	char s1[maxn], s2[maxn];
	int *Next;
	int s1L, s2L, ans;

	while (scanf("%s %s", s1, s2) != EOF){
		s1L = strlen(s1);
		s2L = strlen(s2);
		strcat(s1, s2);
		Next = getNextArray(s1);

		ans = Next[s1L + s2L];
		while (ans > s1L || ans > s2L){
			ans = Next[ans];
		}//inner while

		if (ans){
			for (int i = 0; i < ans; i++){
				printf("%c",s1[i]);
			}//for
			printf(" %d\n",ans);
		}//if
		else{
			printf("0\n");
		}
	}//while


	return 0;
}

Bil!

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
KMP学习笔记

KMP算法是一种字符串匹配算法，可以在 O(n+m) 的时间复杂度内实现两个字符串的匹配。我觉得KMP关键就两点：1.实现在匹配的过程中，主串不回退。 2.用最长前缀等于最长后缀的性质快速求出next[]数组。我们先看关键点一：为什么要实现，在匹配的过程中，主串不回退？？用暴力来解决洛谷3375 【模板】KMP字符串匹配这题试试。题目描述给出两个字符串 s1和s2，若s1的区间[l, r]子串与s2完全相同，则称s2在s1中...
复制链接

扫一扫