数据结构-数组-字符串匹配：Knuth-Morris-Pratt算法(详解附完整代码)

最新推荐文章于 2023-06-08 17:43:24 发布

一棵___大树

最新推荐文章于 2023-06-08 17:43:24 发布

阅读量434

点赞数

分类专栏：数据结构文章标签：数据结构算法 c++

本文链接：https://blog.csdn.net/weixin_53580595/article/details/120306187

版权

数据结构专栏收录该内容

14 篇文章 2 订阅

订阅专栏

字符串匹配

字符串抽象数据类型
字符串模式匹配
代码总览

字符串抽象数据类型

C++语言中包含一个string类，其ADT中包含很多定义的函数，这里就不再详细赘述。

字符串模式匹配

简单的字符串匹配

检验字符串pat是否在str中最简单但最低效的方法：逐个考虑str内每个位置，判断其是否是匹配的起始地址。
代码如下：

// 若匹配返回匹配起始地址，否则返回-1
int Find(const string &str, const string &pat)
{
	//获取字符串长度
	int lengthS = str.size();
	int lengthP = pat.size();

	for (int s = 0; s < lengthS; s++)
	{
		int j;
		for (j = 0; j < lengthP && str[s + j] == pat[j]; j++);
		//匹配到结果返回起始地址
		if (j == lengthP)return s;
	}
	//非匹配到结果
	return -1;
}

Knuth-Morris-Pratt算法

背景分析

上面提到的简单算法的时间复杂度为 $O(lengthS\times lengthP)$ ，能否有一种算法将时间复杂度控制在 $O (l e n g t h S + l e n g t h P)$ 。
为了实现这一想法，我们希望能无需回溯地在字符串中，即当发生一个不匹配（失配）的情况时，我们希望利用模式串（Pat）中不匹配字母和失配位置等相关信息确定继续搜索的位置。如图所示：
为了确定模式串的适配信息，我们给模式串定义一个失配函数。

失配函数

定义

设 $p=p_0p_1...p_{n-1}$ 是一个模式串，定义它的失配函数(failure function)， $f$ 为：

$\begin{cases} k,&k为满足p_0...p_k=p_{j-k}...p_j且k<j的最大整数，k存在且k\ge 0\\ -1, & 无满足k \end{cases}$

例如模式串 $p a t = a b c a b c a c a b$ ，有：

j	0	1	2	3	4	5	6	7	8	9
pat	a	b	c	a	b	c	a	c	a	b
f	-1	-1	-1	0	1	2	3	-1	0	1

实现方法

基于失配函数的另一种表达形式：

$\begin{cases} -1,&j=0\\ f^m(j-1)+1,&m是满足p_{f^k(j-1)+1}=p_j的整数k的最小值\\ -1,&其他情况 \end{cases}$
注意： $f^1(j)=f(j),f^m(j)=f(f^{m-1}(j))$

代码如下：

//失配函数
int* FailureFunction(const string &pat)
{
	int lengthP = pat.size();
	int* f = new int[lengthP];
	int j = 0, k = -1;

	f[0] = -1;
	while (j < lengthP - 1)
	{
		if (pat[j + 1] == pat[k + 1])
		{
			j++;
			k++;
			f[j] = k;
		}
		else if (k == -1)
		{
			j++;
			f[j] = -1;
		}
		else k = f[k];
	}
	//返回f[lengthP]指针
	return f;
}

函数分析

可知失配函数的时间复杂度为 $O (l e n g t h P)$ .

KMP函数

实现方法

设字符串（str的指针索引为 $P o s S$ ，模式串（pat的指针索引为 $P o s P$ ；
在匹配失败回溯时， $P o s S$ 不变， $PosP=f[PosP-1]+1(PosP\ne0)$ .
代码如下：

int KMP(const string& str, const string& pat)
{
	int LengthP = pat.size(), LengthS = str.size();
	int* f;

	//生成失配函数
	f = FailureFunction(pat);

	//查询过程
	int PosP = 0, PosS = 0;
	while (PosP < LengthP && PosS < LengthS)
	{
		if (pat[PosP] == str[PosS])
		{
			PosP++;
			PosS++;
		}
		else
		{
			if (PosP == 0)
				PosS++;
			else PosP = f[PosP - 1] + 1;
		}
	}
	//匹配到结果返回起始地址；
	//非匹配到结果返回-1
	return PosP < LengthP || LengthP == 0 ? -1 : PosS - PosP;
}

函数分析

该算法的时间复杂度为 $O (l e n g t h S + l e n g t h P)$ .

失配信息的另一种方法：next函数

定义

$\begin{cases} -1,&j=0\\ next^m(j)+1,&m是满足p_{next^k(j)}=p_j的整数k的最小值\\ 0,&next^m(j)=-1 \end{cases}$

实现方法

//失配函数2
int* Next(const string & pat)
{
	int lengthP = pat.size();
	int* next = new int[lengthP];
	int j = 0, k = -1;

	next[0] = -1;
	while (j < lengthP)
	{
		if (k == -1 || pat[j] == pat[k])
		{
			j++;
			k++;
			next[j] = k;
		}
		else k = next[k];
	}

	return next;
}

此时KMP算法如下：

int KMP(const string& str, const string& pat)
{
	int LengthP = pat.size(), LengthS = str.size();
	int* f;
	
	//生成失配函数2
	f = Next(pat);

	//查询过程
	int PosP = 0, PosS = 0;
	while (PosP < LengthP && PosS < LengthS)
	{
		if (pat[PosP] == str[PosS])
		{
			PosP++;
			PosS++;
		}
		else
		{
			if (PosP == 0)
				PosS++;
			else PosP = next[PosP];
		}
	}
	//匹配到结果返回起始地址；
	//非匹配到结果返回-1
	return PosP < LengthP || LengthP == 0 ? -1 : PosS - PosP;
}

代码总览

#include<iostream>
using namespace std;

// 若匹配返回匹配起始地址，否则返回-1
int Find(const string &str, const string &pat)
{
	//获取字符串长度
	int lengthS = str.size();
	int lengthP = pat.size();

	for (int s = 0; s < lengthS; s++)
	{
		int j;
		for (j = 0; j < lengthP && str[s + j] == pat[j]; j++);
		//匹配到结果返回起始地址
		if (j == lengthP)return s;
	}
	//非匹配到结果
	return -1;
}


//失配函数1
int* FailureFunction(const string &pat)
{
	int lengthP = pat.size();
	int* f = new int[lengthP];
	int j = 0, k = -1;

	f[0] = -1;
	while (j < lengthP - 1)
	{
		if (pat[j + 1] == pat[k + 1])
		{
			j++;
			k++;
			f[j] = k;
		}
		else if (k == -1)
		{
			j++;
			f[j] = -1;
		}
		else k = f[k];
	}
	//返回f[lengthP]指针
	return f;
}

//失配函数2
int* Next(const string & pat)
{
	int lengthP = pat.size();
	int* next = new int[lengthP];
	int j = 0, k = -1;

	next[0] = -1;
	while (j < lengthP)
	{
		if (k == -1 || pat[j] == pat[k])
		{
			j++;
			k++;
			next[j] = k;
		}
		else k = next[k];
	}

	return next;
}

int KMP(const string& str, const string& pat)
{
	int LengthP = pat.size(), LengthS = str.size();
	int* f;

	//生成失配函数1
	f = FailureFunction(pat);

	//生成失配函数2
	//f = Next(pat);

	//查询过程
	int PosP = 0, PosS = 0;
	while (PosP < LengthP && PosS < LengthS)
	{
		if (pat[PosP] == str[PosS])
		{
			PosP++;
			PosS++;
		}
		else
		{
			if (PosP == 0)
				PosS++;
			else PosP = f[PosP - 1] + 1;//PosP = next[PosP];
		}
	}
	//匹配到结果返回起始地址；
	//非匹配到结果返回-1
	return PosP < LengthP || LengthP == 0 ? -1 : PosS - PosP;
}

int main()
{
	string str = "abcasdababcabcacbddfhs", pat = "abcabcacbd";
	cout << KMP(str, pat) << endl;
}