C++常用算法及数据结构（字符串匹配算法）

最新推荐文章于 2024-05-09 20:18:18 发布

用户已升天

最新推荐文章于 2024-05-09 20:18:18 发布

阅读量2k

点赞数

文章标签： c++ 算法数据结构

本文链接：https://blog.csdn.net/ljq0704/article/details/131541751

版权

字符串匹配算法

字符串匹配算法是指在一个文本串（较长的字符串）中查找一个模式串（较短的字符串）的出现位置或匹配情况的算法。以下是几种常见的字符串匹配算法：

1.朴素字符串匹配算法（Brute-Force算法）：

从文本串的首字母开始依次与模式串作比较，直到找到匹配或者比较到文本串末尾。时间复杂度为O(nm)，其中n为文本串长度，m为模式串长度。

示例代码：

#include <iostream>
#include <string>
using namespace std;
/// Outcome of searching for one pattern inside a text.
///
/// A default-constructed value means "no match": `found` is false and both
/// indices are -1.
struct MatchResult {
    bool found = false;   ///< true when the pattern occurs in the text
    int startIndex = -1;  ///< index of the first matched character, -1 if none
    int endIndex = -1;    ///< index of the last matched character (inclusive), -1 if none
};

/// Naive (brute-force) substring search.
///
/// Tries every window of `pattern.length()` characters in `text`, from left to
/// right, and compares it with `pattern` character by character. The search
/// stops at the first window that matches completely. Worst case O(n * m).
///
/// @param text     string to search in
/// @param pattern  string to search for
/// @return the first occurrence with inclusive start/end indices, or a
///         default (not found) MatchResult. An empty pattern matches at
///         index 0 and therefore yields startIndex == 0 and endIndex == -1.
MatchResult bruteForceSearch(const std::string& text, const std::string& pattern) {
    MatchResult result;  // starts out as "not found"
    const std::size_t n = text.length();
    const std::size_t m = pattern.length();

    // `start + m <= n` is the unsigned-safe form of `start <= n - m`; it is
    // false right away when the pattern is longer than the text.
    for (std::size_t start = 0; start + m <= n; ++start) {
        // Count how many leading characters of the window agree.
        std::size_t matched = 0;
        while (matched < m && text[start + matched] == pattern[matched]) {
            ++matched;
        }
        if (matched == m) {
            result.found = true;
            result.startIndex = static_cast<int>(start);
            result.endIndex = static_cast<int>(start + m) - 1;
            return result;
        }
    }
    return result;
}
int main() {
string text = "ABCABDABEABF"; // 文本串
string pattern = "ABD"; // 模式串
MatchResult result = bruteForceSearch(text, pattern); // 调用优化后的朴素字符串匹配算法函数
// 根据匹配结果进行输出
if (result.found) {
cout << "从文本串的 " << result.startIndex << " 到 " << result.endIndex << endl;
} else {
cout << "未找到匹配的字符串" << endl;
}
return 0;
}

代码解析：

定义了一个名为 MatchResult 的结构体，用于存储匹配结果的相关信息。结构体包含三个成员变量：found 表示是否找到匹配，startIndex 表示匹配的起始索引，endIndex 表示匹配的结束索引。
定义了一个名为 bruteForceSearch 的函数，用于执行优化后的朴素字符串匹配算法。函数的输入参数为文本串 text 和模式串 pattern，返回一个 MatchResult 结构体，表示匹配结果。
在函数内部，我们首先获取文本串和模式串的长度，初始化匹配的起始索引 startIndex 和结束索引 endIndex 为 -1。
然后，我们使用两层循环：外层循环控制滑动窗口的起始位置，内层循环逐个字符比较文本串和模式串的对应字符。
如果在内层循环中找到了完全匹配，即内层循环正常结束（j == m），我们设置匹配的起始索引为当前滑动窗口的起始位置 i，结束索引为起始索引加上模式串的长度减1。
当找到完全匹配后，我们跳出外层循环（使用 break 语句）。
接下来，我们根据匹配的起始索引是否为 -1 分别设置 MatchResult 结构体的 found 字段为 true 或 false，并将匹配的索引值赋值给 startIndex 和 endIndex。
最后，我们返回构造好的 MatchResult 结构体给调用者。
在 main 函数中，我们定义了一个文本串 text 和一个模式串 pattern。
然后，我们调用优化后的朴素字符串匹配算法函数 bruteForceSearch 并将其返回的结果存储在 result 中。
最后，我们根据匹配结果的 found 字段进行判断输出。如果 found 为 true，表示找到了匹配，我们输出匹配的起始索引 startIndex 和结束索引 endIndex。如果 found 为 false，表示未找到匹配，我们输出相应的提示信息。
这样，以上代码实现了一个简单的朴素字符串匹配算法，并通过 MatchResult 结构体将匹配的结果返回给主函数以进行进一步处理和输出。
运行结果

2.KMP算法：

KMP算法利用模式串的特性，通过建立一个部分匹配表（Partial Match Table）来避免在模式串和文本串中的不必要的匹配和回溯。它的时间复杂度为O(n+m)，其中n为文本串长度，m为模式串长度。

示例代码：

#include <iostream>
#include <string>
#include <vector>
using namespace std;
/// Builds the KMP failure function (partial match table) of `pattern`.
///
/// failure[i] is the length of the longest proper prefix of pattern[0..i]
/// that is also a suffix of pattern[0..i].
///
/// @param pattern  the pattern to preprocess (may be empty)
/// @return a table with one entry per pattern character
std::vector<int> getFailureFunction(const std::string& pattern) {
    const std::size_t m = pattern.length();
    std::vector<int> failure(m, 0);
    int j = 0;  // length of the border currently being extended
    for (std::size_t i = 1; i < m; ++i) {
        // On a mismatch fall back to the next shorter border instead of
        // rewinding the loop counter.
        while (j > 0 && pattern[i] != pattern[j]) {
            j = failure[j - 1];
        }
        if (pattern[i] == pattern[j]) {
            ++j;
        }
        failure[i] = j;
    }
    return failure;
}

/// Finds the first occurrence of each pattern in `text` with the KMP algorithm.
///
/// Patterns are processed in the given order. For every pattern that occurs,
/// the start index of its first occurrence is appended to the result.
/// Patterns that do not occur contribute nothing, so the result can be
/// shorter than `patterns`. Empty patterns are skipped.
///
/// @param text      string to search in
/// @param patterns  patterns to look for
/// @return start indices of the first occurrences, in pattern order
std::vector<int> kmpSearch(const std::string& text, const std::vector<std::string>& patterns) {
    const std::size_t n = text.length();
    std::vector<int> positions;
    for (const std::string& pattern : patterns) {
        const std::size_t m = pattern.length();
        // An empty pattern has no characters to compare; indexing it (or its
        // empty failure table) would run out of bounds, so it is skipped.
        // A pattern longer than the text cannot occur either.
        if (m == 0 || m > n) {
            continue;
        }
        const std::vector<int> failure = getFailureFunction(pattern);
        std::size_t j = 0;  // number of pattern characters matched so far
        for (std::size_t i = 0; i < n; ++i) {
            // The text index never moves backwards; only the pattern shifts.
            while (j > 0 && text[i] != pattern[j]) {
                j = static_cast<std::size_t>(failure[j - 1]);
            }
            if (text[i] == pattern[j]) {
                ++j;
            }
            if (j == m) {
                positions.push_back(static_cast<int>(i + 1 - m));
                break;  // only the first occurrence of each pattern is kept
            }
        }
    }
    return positions;
}
// Demo driver: reports the first occurrence of each pattern in a sample text.
//
// Each pattern is searched on its own so that the printed end index is
// derived from the length of the pattern that actually matched. Computing it
// from patterns[0] is only right when all patterns have the same length and
// none of the earlier ones is missing from the text.
int main() {
    const std::string text = "ABCABDABEABF";                          // text to search in
    const std::vector<std::string> patterns = {"ABD", "BCA", "ABE"};  // patterns to look for

    bool anyFound = false;
    for (const std::string& pattern : patterns) {
        const std::vector<int> hits = kmpSearch(text, std::vector<std::string>{pattern});
        if (hits.empty()) {
            continue;
        }
        anyFound = true;
        const int start = hits.front();
        const int end = start + static_cast<int>(pattern.length()) - 1;
        std::cout << "从文本串的" << start << " 到 " << end << std::endl;
    }
    if (!anyFound) {
        std::cout << "未找到匹配的字符串" << std::endl;
    }
    return 0;
}

代码解析：

getFailureFunction 函数用于构建模式串的部分匹配表（也称失配函数）。表中第 i 项 failure[i] 表示子串 pattern[0..i] 中，既是真前缀又是真后缀的最长字符串的长度。例如模式串 "ABAB" 的部分匹配表为 0 0 1 2。函数使用双指针 i 和 j 遍历模式串：当前字符匹配时 j 前进一位；不匹配时 j 根据已经计算出的表项回退。
kmpSearch 函数实现了多个模式串的搜索。对于每个模式串，首先获取其长度 m，然后构建部分匹配表。使用双指针 i 和 j 在文本串中进行匹配的遍历。如果当前字符匹配，继续向后匹配，直到完全匹配。如果不匹配，根据部分匹配表移动模式串的指针，文本串的指针 i 不回退。一旦找到完全匹配，将匹配的起始索引保存到 positions 中，并停止对该模式串的搜索，即每个模式串只记录第一次出现的位置；没有出现的模式串不会在 positions 中留下任何记录。
在 main 函数中，定义了文本串 text 和多个模式串的向量 patterns，分别表示要搜索的文本和模式串。调用 kmpSearch 函数获取匹配的位置信息，并将结果保存到 positions 中。

运行结果

小拓展：
1. for (int i : positions) {
2. cout << "从文本串的" << i << " 到 " << i + patterns[0].length() - 1 << endl;
上面的代码是C++11中引入的基于范围的for循环（range-based for loop，相当于其他语言中的foreach循环）。
其基本的结构为
1. for (element : container) {
2. // 循环体，对每个 element 进行操作
3. }
其中，container 是一个可迭代的容器（如 vector、list、array 等），element 是容器中的每个元素。
在循环的每次迭代中，element 会依次被赋值为容器中的每个元素，并执行循环体内的操作。

下面是简单的例子：
1. vector<int> numbers = {1, 2, 3, 4, 5};
2. for (int num : numbers) {
3. cout << num << " ";
4. //输出结果为1 2 3 4 5
5. }
3.Rabin-Karp算法：

Rabin-Karp算法通过哈希函数对模式串和文本串中的子串分别进行哈希，然后比较哈希值来判断是否可能匹配；哈希值相等时还需要逐字符比较，以排除哈希冲突。它的平均（期望）时间复杂度为O(n+m)，但在哈希冲突频繁的最坏情况下会退化到O(nm)，其中n为文本串长度，m为模式串长度。

示例代码：
1. #include <iostream>
2. #include <string>
3. #include <vector>
4. using namespace std;
const int prime = 101;  // modulus of the rolling hash

/// Finds every occurrence of `pattern` in `text` with the Rabin-Karp algorithm.
///
/// A polynomial hash (base 256, modulo `prime`) of the pattern is compared
/// with the hash of each text window. Only when the hashes agree are the
/// characters compared, so hash collisions never produce false matches.
/// The base must differ from the modulus: with base == modulus every term
/// except the last character vanishes and the hash stops filtering anything.
///
/// @param text     string to search in
/// @param pattern  string to search for
/// @return inclusive (start, end) index pairs of all occurrences, in
///         increasing order of start. A pattern longer than the text yields
///         no matches. An empty pattern matches at every index 0..n and is
///         reported as (i, i - 1).
std::vector<std::pair<int, int>> rabinKarpSearch(const std::string& text, const std::string& pattern) {
    std::vector<std::pair<int, int>> positions;
    const std::size_t n = text.length();
    const std::size_t m = pattern.length();

    if (m > n) {
        return positions;  // the first window would read past the end of text
    }
    if (m == 0) {
        for (std::size_t i = 0; i <= n; ++i) {
            positions.push_back(std::make_pair(static_cast<int>(i), static_cast<int>(i) - 1));
        }
        return positions;
    }

    const int base = 256;
    // Hash characters as unsigned bytes so the result does not depend on
    // whether plain char is signed.
    auto code = [](char c) { return static_cast<int>(static_cast<unsigned char>(c)); };

    int patternHash = 0;
    int windowHash = 0;
    int highPower = 1;  // base^(m-1) mod prime: weight of the window's first character
    for (std::size_t i = 0; i < m; ++i) {
        patternHash = (patternHash * base + code(pattern[i])) % prime;
        windowHash = (windowHash * base + code(text[i])) % prime;
        if (i != 0) {
            highPower = (highPower * base) % prime;
        }
    }

    for (std::size_t i = 0; i + m <= n; ++i) {
        if (patternHash == windowHash && text.compare(i, m, pattern) == 0) {
            positions.push_back(std::make_pair(static_cast<int>(i), static_cast<int>(i + m) - 1));
        }
        if (i + m < n) {
            // Drop text[i], shift the window one place, append text[i + m].
            // Adding `prime` before the multiplication keeps the value
            // non-negative.
            const int leading = (code(text[i]) * highPower) % prime;
            windowHash = ((windowHash - leading + prime) * base + code(text[i + m])) % prime;
        }
    }
    return positions;
}
37. int main() {
38. string text = "ABCABDABEABF"; // 文本串
39. string pattern = "ABD"; // 模式串
41. vector<pair<int, int>> positions = rabinKarpSearch(text, pattern);
43. if (positions.empty()) {
44. cout << "未找到匹配的字符串" << endl; // 未找到匹配
45. } else {
46. for (auto pos : positions) {
47. cout << "文本串的 " << pos.first << "到 " << pos.second << endl;
48. }
49. }
51. return 0;
52. }
代码解析：

在rabinKarpSearch函数中，首先计算模式串的哈希值和初始窗口的哈希值。然后，在文本串中滑动窗口，比较窗口的哈希值与模式串的哈希值是否相等，以及窗口内的字符串与模式串是否相等。如果相等，则将匹配的起始索引和结束索引存储到positions向量中。

在main函数中，定义了一个文本串text和一个模式串pattern。然后调用rabinKarpSearch函数进行搜索，并将返回的匹配位置存储到positions向量中。如果没有找到匹配的字符串，输出提示信息”未找到匹配的字符串”。如果找到匹配的字符串，遍历positions向量并输出每一个匹配的起始索引和结束索引。
运行结果同上（想偷懒）

4.Aho-Corasick算法：

Aho-Corasick算法是一种多模式匹配算法，用于在一个文本串中同时匹配多个模式串。它利用自动机的思想构建一个Trie树，并通过状态转移和失配函数来处理匹配过程。它的时间复杂度为O(n+m+z)，其中n为文本串长度，m为所有模式串的长度之和，z为匹配结果的总数。

示例代码：（写的很烂）

#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
/// One state of the Aho-Corasick automaton, i.e. one node of the pattern trie.
struct TrieNode {
    /// Number of outgoing edge slots: one per possible byte value, so any
    /// character can be used as an index without going out of bounds.
    static constexpr int kAlphabetSize = 256;

    std::vector<TrieNode*> children;  // child per byte value, nullptr when there is no edge
    std::vector<int> patterns;        // indices of the patterns that end at this node
    TrieNode* fail;                   // failure link; nullptr until the links are built
    TrieNode* parent;                 // trie parent (nullptr for the root); stored, not read by the automaton
    char ch;                          // character on the edge from the parent; stored, not read by the automaton

    TrieNode(TrieNode* parent, char ch)
        : children(kAlphabetSize, nullptr), fail(nullptr), parent(parent), ch(ch) {}
};

/// Multi-pattern matcher based on the Aho-Corasick automaton.
///
/// The constructor builds a trie of all patterns and then computes failure
/// links; search() walks the text once and reports every occurrence of every
/// pattern. The object owns its trie nodes and frees them on destruction, so
/// it is not copyable.
class AhoCorasick {
public:
    /// Builds the automaton for `patterns`. Empty patterns are accepted but
    /// are never reported by search().
    AhoCorasick(const std::vector<std::string>& patterns) : root(nullptr), patterns(patterns) {
        try {
            root = new TrieNode(nullptr, '\0');
            buildTrie();
            buildFailPointers();
        } catch (...) {
            // The destructor does not run when a constructor throws.
            destroy(root);
            throw;
        }
    }

    ~AhoCorasick() { destroy(root); }

    // Copying would leave two objects owning the same nodes.
    AhoCorasick(const AhoCorasick&) = delete;
    AhoCorasick& operator=(const AhoCorasick&) = delete;

    /// Finds all occurrences of all patterns in `text`.
    ///
    /// @param text  string to scan; any byte value is allowed
    /// @return inclusive (start, end) index pairs, ordered by end index; for
    ///         the same end index, longer matches are listed first
    std::vector<std::pair<int, int>> search(const std::string& text) const {
        std::vector<std::pair<int, int>> positions;
        const TrieNode* state = root;
        for (std::size_t i = 0; i < text.length(); ++i) {
            const std::size_t slot = slotOf(text[i]);
            // Follow failure links until some state can consume the character.
            while (state != root && state->children[slot] == nullptr) {
                state = state->fail;
            }
            if (state->children[slot] != nullptr) {
                state = state->children[slot];
            }
            // Every state on the failure chain is a suffix of the text read
            // so far, so each pattern stored on the chain ends at index i.
            for (const TrieNode* hit = state; hit != root; hit = hit->fail) {
                for (int patternIndex : hit->patterns) {
                    const int endPos = static_cast<int>(i);
                    const int startPos =
                        endPos - static_cast<int>(this->patterns[patternIndex].length()) + 1;
                    positions.push_back(std::make_pair(startPos, endPos));
                }
            }
        }
        return positions;
    }

private:
    TrieNode* root;
    std::vector<std::string> patterns;

    /// Maps a character to its child slot; going through unsigned char keeps
    /// the index inside 0..255 even when plain char is signed.
    static std::size_t slotOf(char ch) {
        return static_cast<std::size_t>(static_cast<unsigned char>(ch));
    }

    /// Frees `node` and everything below it, iteratively.
    static void destroy(TrieNode* node) {
        if (node == nullptr) {
            return;
        }
        std::vector<TrieNode*> pending(1, node);
        while (!pending.empty()) {
            TrieNode* current = pending.back();
            pending.pop_back();
            for (TrieNode* child : current->children) {
                if (child != nullptr) {
                    pending.push_back(child);
                }
            }
            delete current;
        }
    }

    /// Inserts every stored pattern into the trie.
    void buildTrie() {
        for (std::size_t i = 0; i < patterns.size(); ++i) {
            insertPattern(patterns[i], static_cast<int>(i));
        }
    }

    /// Adds one pattern to the trie and records its index on its last node.
    void insertPattern(const std::string& pattern, int patternIndex) {
        TrieNode* node = root;
        for (char ch : pattern) {
            const std::size_t slot = slotOf(ch);
            if (node->children[slot] == nullptr) {
                node->children[slot] = new TrieNode(node, ch);
            }
            node = node->children[slot];
        }
        node->patterns.push_back(patternIndex);
    }

    /// Computes the failure link of every node.
    ///
    /// Nodes must be processed in breadth-first order: the link of a node is
    /// found by following links of strictly shallower nodes, which therefore
    /// have to be final already. `order` serves as the queue; `head` is its
    /// read position.
    void buildFailPointers() {
        root->fail = root;
        std::vector<TrieNode*> order;
        for (TrieNode* child : root->children) {
            if (child != nullptr) {
                child->fail = root;
                order.push_back(child);
            }
        }
        for (std::size_t head = 0; head < order.size(); ++head) {
            TrieNode* current = order[head];
            for (int slot = 0; slot < TrieNode::kAlphabetSize; ++slot) {
                TrieNode* child = current->children[slot];
                if (child == nullptr) {
                    continue;
                }
                // Longest proper suffix of the child's string that is also a
                // path in the trie.
                TrieNode* candidate = current->fail;
                while (candidate != root && candidate->children[slot] == nullptr) {
                    candidate = candidate->fail;
                }
                if (candidate->children[slot] != nullptr) {
                    candidate = candidate->children[slot];
                }
                child->fail = candidate;
                order.push_back(child);
            }
        }
    }
};
// Demo driver: builds an automaton for a few patterns, scans a sample text
// and prints the inclusive index range of every occurrence found.
int main() {
    const std::vector<std::string> patterns = { "he", "she", "his", "hers" };
    const std::string text = "ushershehis";

    AhoCorasick ahoCorasick(patterns);
    const std::vector<std::pair<int, int>> positions = ahoCorasick.search(text);

    if (positions.empty()) {
        std::cout << "No patterns found" << std::endl;
        return 0;
    }
    for (std::size_t k = 0; k < positions.size(); ++k) {
        std::cout << "Pattern found at positions: " << positions[k].first << " - " << positions[k].second << std::endl;
    }
    return 0;
}

用户已升天

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
C++常用算法及数据结构（字符串匹配算法）

然后，在文本串中滑动窗口，比较窗口的哈希值与模式串的哈希值是否相等，以及窗口内的字符串与模式串是否相等。它的时间复杂度为O(n+m+k)，其中n为文本串长度，m为模式串平均长度，k为模式串数量。KMP算法利用模式串的特性，通过建立一个部分匹配表（Partial Match Table）来避免在模式串和文本串中的不必要的匹配和回溯。它的时间复杂度为O(n+m)，其中n为文本串长度，m为模式串长度。字符串匹配算法是指在一个文本串（较长的字符串）中查找一个模式串（较短的字符串）的出现位置或匹配情况的算法。
复制链接

扫一扫