设S为源串,P为模式串——字符串匹配指的是S是否包含P
m = S.size();
n = P.size();
朴素匹配法: O(m*n)
对于S中下标在【0,m-n】的每个位置,以它为起点逐个向后遍历,并与P进行比较。
复杂度较高。
// Naive matching: O(m*n).
// Tries every start offset in S at which P could still fit and prints one
// "Pattern found at index <offset>" line for each offset where all of P lines up.
// Prints nothing when P is empty or longer than S.
void Match(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();
    if (n == 0 || n > m) {
        return;  // nothing can match
    }
    const int lastStart = m - n;
    for (int start = 0; start <= lastStart; ++start) {
        bool same = true;
        for (int k = 0; k < n; ++k) {
            if (S[start + k] != P[k]) {
                same = false;  // first mismatch rules this offset out
                break;
            }
        }
        if (same) {
            cout << "Pattern found at index " << start << endl;
        }
    }
}
哈希匹配法: O(m*n)
利用进制的思想将字符串转换到整数域:把每个字符看作seed进制数的一位,并对mod取模。相同的字符串一定得到相同的hash值,不同的字符串大概率得到不同的hash值。计算S中所有长度为n的子串的hash值,并与P的hash值比较即可。
复杂度较高。并且有可能产生哈希冲突,需要特殊判断处理。
// Polynomial hash of s: treats the characters as digits of a base-`seed` number
// and reduces modulo `mod` after every step.
// The result is always in [0, mod):
//  - the accumulator is a long long, so `acc * seed` cannot overflow even on
//    platforms where `long` is only 32 bits wide;
//  - where plain `char` is signed, bytes >= 0x80 are negative and would make the
//    remainder negative; it is folded back into range so that equal strings
//    always compare equal against a normalised rolling hash.
long Hash(const string& s) {
    long long acc = 0;
    for (char ch : s) {
        acc = (acc * seed + ch) % mod;  // reduce on every step to keep acc small
        if (acc < 0) {
            acc += mod;  // C++ remainder takes the sign of the dividend
        }
    }
    return static_cast<long>(acc);
}
// Hash matching: O(m*n).
// Hashes every length-n window of S with Hash() and compares it with Hash(P).
// A hash hit is confirmed character by character, so a hash collision can never
// produce a false report. Prints "Pattern found at index <offset>" per match and
// nothing when P is empty or longer than S.
//
// Each window hash is used as soon as it is computed. There is no fixed-size
// table of hashes, so the text length is not limited by any buffer size.
void HashMatch(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();
    if (n == 0 || m < n) return;  // nothing can match

    const long hash_p = Hash(P);
    for (int i = 0; i + n <= m; i++) {
        if (Hash(S.substr(i, n)) != hash_p) {
            continue;  // different hash => certainly a different string
        }
        // Equal hashes do not prove equality; verify the window itself.
        if (S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
    }
}
Rabin-Karp(滚动哈希优化): O(m+n)
哈希法的优化版,利用滑动窗口的思想,将每次更新hash值的复杂度从O(n)降到常数级:窗口右移一位时,只需将旧hash乘以seed,加上新进入窗口的字符的值,再减去离开窗口的首字符的值乘以seed^n。
复杂度较低,也存在hash冲突,需特殊处理。
// Rabin-Karp (rolling hash): expected O(m+n).
// Keeps the hash of the current length-n window of S and updates it in O(1) when
// the window slides by one character:
//     new = old * seed + incoming - outgoing * seed^n   (mod `mod`)
// A hash hit is confirmed by a direct comparison, so collisions never produce a
// false report. Prints "Pattern found at index <offset>" per match and nothing
// when P is empty or longer than S.
//
// All arithmetic is done in long long and every hash is normalised to [0, mod),
// so the comparison stays valid when `long` is 32 bits wide and when Hash()
// yields a negative remainder for bytes >= 0x80 on signed-char platforms.
void HashMatchPlus(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();
    if (n == 0 || m < n) return;  // nothing can match

    const long long modulus = mod;
    // Maps any value to its representative in [0, modulus).
    auto wrap = [modulus](long long v) {
        v %= modulus;
        return v < 0 ? v + modulus : v;
    };

    const long long hash_p = wrap(Hash(P));
    long long hash_s = wrap(Hash(S.substr(0, n)));

    // seed^n % mod: weight of the outgoing character after the window hash has
    // been multiplied by seed (could be computed with fast exponentiation).
    long long seed_n = 1;
    for (int i = 0; i < n; i++) {
        seed_n = (seed_n * seed) % modulus;
    }

    for (int i = 0; i + n <= m; i++) {
        // Confirm a hash hit with a real comparison.
        if (hash_p == hash_s && S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
        if (i + n < m) {
            // Slide the window: shift, add S[i + n], drop S[i].
            hash_s = wrap(hash_s * seed + S[i + n] - S[i] * seed_n);
        }
    }
}
KMP: O(m+n)
主要在于next数组的计算,有点类似于预处理。next只与P有关,和S无关。
复杂度较低。
// Builds the KMP fallback ("next") table for pattern P.
// table[0] is -1; for j >= 1, table[j] is the length of the longest proper
// prefix of P[0..j-1] that is also a suffix of it.
// Special cases: an empty pattern yields {-1}; a one-character pattern yields
// the two-element table {-1, 0}.
vector<int> Next(const string& P) {
    const int len = P.size();
    if (len == 0) return vector<int>{-1};
    if (len == 1) return vector<int>{-1, 0};

    vector<int> table(len, 0);
    table[0] = -1;

    int pos = 1;     // last index whose table entry is already known
    int border = 0;  // border length being extended, i.e. table[pos]
    while (pos + 1 < len) {
        if (border >= 0 && P[pos] != P[border]) {
            border = table[border];  // fall back to the next shorter border
            continue;
        }
        // Either no border is left (border == -1) or the border extends by one.
        ++pos;
        ++border;
        table[pos] = border;
    }
    return table;
}
// KMP算法: O(m+n)
void KMP(const string& S, const string& P) {
int m = S.size();
int n = P.size();
if(m<n || n==0 || m==0) return;
vector<int> next=Next(P);
int i=0,j=0;
while (i < m) {
if (j<0 || S[i] == P[j]) {
i++;
j++;
}
else {
j = next[j];
}
if (j == n) {
cout << "Pattern found at index " << i-j << endl;
i--;
j = next[j-1];
}
}
}
后缀数组法:大概 $O(n^2 \log n)$,未优化
- 求出源串的所有后缀数组
- 对后缀数组按字典序排序,同时附带上后缀数组元素的下标(这里用类来封装,也可以用结构体),便于二分查找匹配位置
- 使用二分查找模式串在已序后缀数组中的位置,找到后退出。
- 以mid为中心向前后线性遍历,找到所有匹配位置
// 5. Naive suffix-array matching; building the sorted array costs about O(n^2 log n).
// Each Suffix stores the text of one suffix together with the offset it starts at.
class Suffix {
public:
    string suff;  // text of the suffix
    int index;    // offset in the source string where the suffix starts

    Suffix(string suff, int index) : suff(suff), index(index) {}

    // Sort predicate: lexicographic order of the suffix text.
    // Takes references so that sorting does not copy two strings per comparison.
    static bool cmp(const Suffix& a, const Suffix& b) {
        return a.suff < b.suff;
    }

    // Builds the suffixes of s that start at offsets 0..n-1, sorts them
    // lexicographically, prints one "<suffix> <offset>" line per entry followed
    // by an empty line, and returns the sorted array.
    // n must not exceed s.size(); callers in this file pass s.size().
    static vector<Suffix> buildSuffixArray(const string& s, int n)
    {
        vector<Suffix> suffixArray;
        if (n > 0) {
            suffixArray.reserve(n);
        }
        for (int i = 0; i < n; i++) {
            suffixArray.push_back(Suffix(s.substr(i), i));
        }
        sort(suffixArray.begin(), suffixArray.end(), cmp);
        for (const Suffix& entry : suffixArray) {
            cout << entry.suff << " " << entry.index << endl;
        }
        cout << endl;
        return suffixArray;
    }

    // Binary-searches the sorted suffixArray for a suffix that starts with pat,
    // then walks left and right from that hit to report every occurrence as
    // "Pattern found at index <offset>".
    // Prints nothing when pat is empty, when the array is empty, or when pat
    // does not occur. txt is kept for interface compatibility; the search range
    // is taken from suffixArray itself so it can never run past the array.
    static void search(const string& pat, const string& txt, const vector<Suffix>& suffixArray)
    {
        (void)txt;
        const int m = pat.size();
        const int n = suffixArray.size();
        if (m == 0 || n == 0) {
            return;
        }

        int l = 0, r = n - 1, mid = 0;
        bool found = false;
        while (l <= r)
        {
            mid = l + (r - l) / 2;
            // Compare only the first m characters of the suffix with pat.
            const int order = suffixArray[mid].suff.compare(0, m, pat);
            if (order == 0) {
                found = true;
                break;
            }
            if (order < 0)
                l = mid + 1;
            else
                r = mid - 1;
        }
        if (!found) {
            return;  // no suffix starts with pat: report nothing
        }

        cout << "Pattern found at index " << suffixArray[mid].index << endl;
        // Suffixes sharing the prefix pat are contiguous in sorted order.
        int cnt = mid;
        while (cnt > 0 && suffixArray[cnt - 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[--cnt].index << endl;
        }
        cnt = mid;
        while (cnt < n - 1 && suffixArray[cnt + 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[++cnt].index << endl;
        }
    }
};
后缀数组优化:使用倍增法优化到 $O(n \log^2 n)$
和后缀数组的操作类似,不过免去了存储后缀字符串的方式,改用存储原后缀下标,并且采用倍增法进行排序,降低了时间复杂度。
// Suffix array built by prefix doubling, O(n log^2 n).
// Stores only the start offsets of the suffixes of S, in lexicographic order.
class SuffixArray {
public:
    // Copies S and builds its suffix array immediately.
    SuffixArray(const string& S) : S(S), n(S.size()) {
        sa = Cal_SA();
    }

    // Computes the suffix array of S, prints it as space-separated offsets on
    // one line, and returns it.
    vector<int> Cal_SA() {
        vector<int> rank(n), temp_rank(n), sa(n);
        // Start from the order of the first character alone. Characters are
        // ranked as unsigned bytes: that keeps every rank above the -1
        // "past the end" sentinel used below and matches the byte order
        // string::compare uses in SA_Match.
        for (int i = 0; i < n; ++i) {
            sa[i] = i;
            rank[i] = static_cast<unsigned char>(S[i]);
        }
        // Invariant: rank[] orders the suffixes by their first k characters;
        // each round extends that to the first 2k characters.
        for (int k = 1; k < n; k *= 2) {
            // Order by (rank of first half, rank of second half); a suffix
            // that ends before its second half sorts first.
            auto cmp = [this, &rank, k](int a, int b) {
                if (rank[a] != rank[b]) return rank[a] < rank[b];
                int ra = (a + k < n) ? rank[a + k] : -1;
                int rb = (b + k < n) ? rank[b + k] : -1;
                return ra < rb;
            };
            sort(sa.begin(), sa.end(), cmp);
            // Re-rank: neighbours that compare equal share a rank.
            temp_rank[sa[0]] = 0;
            for (int i = 1; i < n; ++i) {
                temp_rank[sa[i]] = temp_rank[sa[i - 1]] + (cmp(sa[i - 1], sa[i]) ? 1 : 0);
            }
            swap(rank, temp_rank);
            // All ranks distinct: the order is final, further rounds change nothing.
            if (rank[sa[n - 1]] == n - 1) break;
        }
        for (auto i : sa)
            cout << i << " ";
        cout << endl;
        return sa;
    }

    // Reports every occurrence of P in S as "Pattern found at index <offset>".
    // Binary-searches the suffix array for a suffix starting with P, then walks
    // to both sides of the hit, because all such suffixes are contiguous.
    // Prints nothing when P is empty, S is empty, or P does not occur.
    void SA_Match(const string& P) {
        const int m = P.size();
        if (m == 0 || n == 0) return;

        int left = 0, right = n - 1, mid = 0;
        bool found = false;
        while (left <= right) {
            mid = left + (right - left) / 2;
            // Compare the first m characters of the suffix at sa[mid] with P.
            const int order = S.compare(sa[mid], m, P);
            if (order < 0)
                left = mid + 1;
            else if (order > 0)
                right = mid - 1;
            else {
                found = true;
                break;
            }
        }
        if (!found) return;  // no suffix starts with P: report nothing

        cout << "Pattern found at index " << sa[mid] << endl;
        int cnt = mid;
        while (cnt > 0 && S.compare(sa[cnt - 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[--cnt] << endl;
        }
        cnt = mid;
        while (cnt < n - 1 && S.compare(sa[cnt + 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[++cnt] << endl;
        }
    }

private:
    string S;        // the indexed text
    int n;           // S.size()
    vector<int> sa;  // suffix start offsets in lexicographic order
};
总代码如下:
#include <iostream>
#include <string>
#include <vector>
#include <cmath>
#include <algorithm>
#define seed 31
#define mod 1000000007
#define Size 1000
using namespace std;
// Polynomial hash of s: treats the characters as digits of a base-`seed` number
// and reduces modulo `mod` after every step.
// The result is always in [0, mod):
//  - the accumulator is a long long, so `acc * seed` cannot overflow even on
//    platforms where `long` is only 32 bits wide;
//  - where plain `char` is signed, bytes >= 0x80 are negative and would make the
//    remainder negative; it is folded back into range so that equal strings
//    always compare equal against a normalised rolling hash.
long Hash(const string& s) {
    long long acc = 0;
    for (char ch : s) {
        acc = (acc * seed + ch) % mod;  // reduce on every step to keep acc small
        if (acc < 0) {
            acc += mod;  // C++ remainder takes the sign of the dividend
        }
    }
    return static_cast<long>(acc);
}
// Builds the KMP fallback ("next") table for pattern P.
// table[0] is -1; for j >= 1, table[j] is the length of the longest proper
// prefix of P[0..j-1] that is also a suffix of it.
// Special cases: an empty pattern yields {-1}; a one-character pattern yields
// the two-element table {-1, 0}.
vector<int> Next(const string& P) {
    const int len = P.size();
    if (len == 0) return vector<int>{-1};
    if (len == 1) return vector<int>{-1, 0};

    vector<int> table(len, 0);
    table[0] = -1;

    int pos = 1;     // last index whose table entry is already known
    int border = 0;  // border length being extended, i.e. table[pos]
    while (pos + 1 < len) {
        if (border >= 0 && P[pos] != P[border]) {
            border = table[border];  // fall back to the next shorter border
            continue;
        }
        // Either no border is left (border == -1) or the border extends by one.
        ++pos;
        ++border;
        table[pos] = border;
    }
    return table;
}
// 1. Naive matching: O(m*n).
// Tries every start offset in S at which P could still fit and prints one
// "Pattern found at index <offset>" line for each offset where all of P lines up.
// Prints nothing when P is empty or longer than S.
void Match(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();
    if (n == 0 || n > m) {
        return;  // nothing can match
    }
    const int lastStart = m - n;
    for (int start = 0; start <= lastStart; ++start) {
        bool same = true;
        for (int k = 0; k < n; ++k) {
            if (S[start + k] != P[k]) {
                same = false;  // first mismatch rules this offset out
                break;
            }
        }
        if (same) {
            cout << "Pattern found at index " << start << endl;
        }
    }
}
// 2. Hash matching: O(m*n).
// Hashes every length-n window of S with Hash() and compares it with Hash(P).
// A hash hit is confirmed character by character, so a hash collision can never
// produce a false report. Prints "Pattern found at index <offset>" per match and
// nothing when P is empty or longer than S.
//
// Each window hash is used as soon as it is computed. There is no fixed-size
// table of hashes, so the text length is not limited by any buffer size.
void HashMatch(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();
    if (n == 0 || m < n) return;  // nothing can match

    const long hash_p = Hash(P);
    for (int i = 0; i + n <= m; i++) {
        if (Hash(S.substr(i, n)) != hash_p) {
            continue;  // different hash => certainly a different string
        }
        // Equal hashes do not prove equality; verify the window itself.
        if (S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
    }
}
// 3. Rabin-Karp (rolling hash): expected O(m+n).
// Keeps the hash of the current length-n window of S and updates it in O(1) when
// the window slides by one character:
//     new = old * seed + incoming - outgoing * seed^n   (mod `mod`)
// A hash hit is confirmed by a direct comparison, so collisions never produce a
// false report. Prints "Pattern found at index <offset>" per match and nothing
// when P is empty or longer than S.
//
// All arithmetic is done in long long and every hash is normalised to [0, mod),
// so the comparison stays valid when `long` is 32 bits wide and when Hash()
// yields a negative remainder for bytes >= 0x80 on signed-char platforms.
void HashMatchPlus(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();
    if (n == 0 || m < n) return;  // nothing can match

    const long long modulus = mod;
    // Maps any value to its representative in [0, modulus).
    auto wrap = [modulus](long long v) {
        v %= modulus;
        return v < 0 ? v + modulus : v;
    };

    const long long hash_p = wrap(Hash(P));
    long long hash_s = wrap(Hash(S.substr(0, n)));

    // seed^n % mod: weight of the outgoing character after the window hash has
    // been multiplied by seed (could be computed with fast exponentiation).
    long long seed_n = 1;
    for (int i = 0; i < n; i++) {
        seed_n = (seed_n * seed) % modulus;
    }

    for (int i = 0; i + n <= m; i++) {
        // Confirm a hash hit with a real comparison.
        if (hash_p == hash_s && S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
        if (i + n < m) {
            // Slide the window: shift, add S[i + n], drop S[i].
            hash_s = wrap(hash_s * seed + S[i + n] - S[i] * seed_n);
        }
    }
}
// 4.KMP算法: O(m+n)
void KMP(const string& S, const string& P) {
int m = S.size();
int n = P.size();
if (m < n || n == 0 || m == 0) return;
vector<int> next = Next(P);
int i = 0, j = 0;
while (i < m) {
if (j < 0 || S[i] == P[j]) {
i++;
j++;
}
else {
j = next[j];
}
if (j == n) {
cout << "Pattern found at index " << i - j << endl;
i--;
j = next[j - 1];
}
}
}
// 5. Naive suffix-array matching; building the sorted array costs about O(n^2 log n).
// Each Suffix stores the text of one suffix together with the offset it starts at.
class Suffix {
public:
    string suff;  // text of the suffix
    int index;    // offset in the source string where the suffix starts

    Suffix(string suff, int index) : suff(suff), index(index) {}

    // Sort predicate: lexicographic order of the suffix text.
    // Takes references so that sorting does not copy two strings per comparison.
    static bool cmp(const Suffix& a, const Suffix& b) {
        return a.suff < b.suff;
    }

    // Builds the suffixes of s that start at offsets 0..n-1, sorts them
    // lexicographically, prints one "<suffix> <offset>" line per entry followed
    // by an empty line, and returns the sorted array.
    // n must not exceed s.size(); callers in this file pass s.size().
    static vector<Suffix> buildSuffixArray(const string& s, int n)
    {
        vector<Suffix> suffixArray;
        if (n > 0) {
            suffixArray.reserve(n);
        }
        for (int i = 0; i < n; i++) {
            suffixArray.push_back(Suffix(s.substr(i), i));
        }
        sort(suffixArray.begin(), suffixArray.end(), cmp);
        for (const Suffix& entry : suffixArray) {
            cout << entry.suff << " " << entry.index << endl;
        }
        cout << endl;
        return suffixArray;
    }

    // Binary-searches the sorted suffixArray for a suffix that starts with pat,
    // then walks left and right from that hit to report every occurrence as
    // "Pattern found at index <offset>".
    // Prints nothing when pat is empty, when the array is empty, or when pat
    // does not occur. txt is kept for interface compatibility; the search range
    // is taken from suffixArray itself so it can never run past the array.
    static void search(const string& pat, const string& txt, const vector<Suffix>& suffixArray)
    {
        (void)txt;
        const int m = pat.size();
        const int n = suffixArray.size();
        if (m == 0 || n == 0) {
            return;
        }

        int l = 0, r = n - 1, mid = 0;
        bool found = false;
        while (l <= r)
        {
            mid = l + (r - l) / 2;
            // Compare only the first m characters of the suffix with pat.
            const int order = suffixArray[mid].suff.compare(0, m, pat);
            if (order == 0) {
                found = true;
                break;
            }
            if (order < 0)
                l = mid + 1;
            else
                r = mid - 1;
        }
        if (!found) {
            return;  // no suffix starts with pat: report nothing
        }

        cout << "Pattern found at index " << suffixArray[mid].index << endl;
        // Suffixes sharing the prefix pat are contiguous in sorted order.
        int cnt = mid;
        while (cnt > 0 && suffixArray[cnt - 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[--cnt].index << endl;
        }
        cnt = mid;
        while (cnt < n - 1 && suffixArray[cnt + 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[++cnt].index << endl;
        }
    }
};
// 6. Suffix array built by prefix doubling, O(n (log n)^2).
// Stores only the start offsets of the suffixes of S, in lexicographic order.
class SuffixArray {
public:
    // Copies S and builds its suffix array immediately.
    SuffixArray(const string& S) : S(S), n(S.size()) {
        sa = Cal_SA();
    }

    // Computes and returns the suffix array of S.
    vector<int> Cal_SA() {
        vector<int> rank(n), temp_rank(n), sa(n);
        // Start from the order of the first character alone. Characters are
        // ranked as unsigned bytes: that keeps every rank above the -1
        // "past the end" sentinel used below and matches the byte order
        // string::compare uses in SA_Match.
        for (int i = 0; i < n; ++i) {
            sa[i] = i;
            rank[i] = static_cast<unsigned char>(S[i]);
        }
        // Invariant: rank[] orders the suffixes by their first k characters;
        // each round extends that to the first 2k characters.
        for (int k = 1; k < n; k *= 2) {
            // Order by (rank of first half, rank of second half); a suffix
            // that ends before its second half sorts first.
            auto cmp = [this, &rank, k](int a, int b) {
                if (rank[a] != rank[b]) return rank[a] < rank[b];
                int ra = (a + k < n) ? rank[a + k] : -1;
                int rb = (b + k < n) ? rank[b + k] : -1;
                return ra < rb;
            };
            sort(sa.begin(), sa.end(), cmp);
            // Re-rank: neighbours that compare equal share a rank.
            temp_rank[sa[0]] = 0;
            for (int i = 1; i < n; ++i) {
                temp_rank[sa[i]] = temp_rank[sa[i - 1]] + (cmp(sa[i - 1], sa[i]) ? 1 : 0);
            }
            swap(rank, temp_rank);
            // All ranks distinct: the order is final, further rounds change nothing.
            if (rank[sa[n - 1]] == n - 1) break;
        }
        return sa;
    }

    // Reports every occurrence of P in S as "Pattern found at index <offset>".
    // Binary-searches the suffix array for a suffix starting with P, then walks
    // to both sides of the hit, because all such suffixes are contiguous.
    // Prints nothing when P is empty, S is empty, or P does not occur.
    void SA_Match(const string& P) {
        const int m = P.size();
        if (m == 0 || n == 0) return;

        int left = 0, right = n - 1, mid = 0;
        bool found = false;
        while (left <= right) {
            mid = left + (right - left) / 2;
            // Compare the first m characters of the suffix at sa[mid] with P.
            const int order = S.compare(sa[mid], m, P);
            if (order < 0)
                left = mid + 1;
            else if (order > 0)
                right = mid - 1;
            else {
                found = true;
                break;
            }
        }
        if (!found) return;  // no suffix starts with P: report nothing

        cout << "Pattern found at index " << sa[mid] << endl;
        int cnt = mid;
        while (cnt > 0 && S.compare(sa[cnt - 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[--cnt] << endl;
        }
        cnt = mid;
        while (cnt < n - 1 && S.compare(sa[cnt + 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[++cnt] << endl;
        }
    }

    // Returns a copy of the suffix array (suffix start offsets in sorted order).
    vector<int> getSA() const { return sa; }

private:
    string S;        // the indexed text
    int n;           // S.size()
    vector<int> sa;  // suffix start offsets in lexicographic order
};
// Longest-common-prefix (LCP) array that accompanies a suffix array.
// lcp[r] is the number of leading characters shared by the suffixes at
// positions r-1 and r of the suffix array; lcp[0] stays 0.
class LCPArray {
public:
    // Copies the text and its suffix array, then computes the LCP array.
    LCPArray(const string& S, const vector<int>& _sa) : S(S), sa(_sa), n(S.size()) {
        lcp = Cal_LCP();
    }

    // Computes the LCP array by visiting the suffixes in text order and reusing
    // the previous match length minus one as the starting point for the next.
    vector<int> Cal_LCP() {
        vector<int> posInSa(n), result(n);
        // Inverse of the suffix array: text offset -> position in sorted order.
        for (int r = 0; r < n; ++r) {
            posInSa[sa[r]] = r;
        }

        int matched = 0;  // common-prefix length carried over between iterations
        for (int start = 0; start < n; ++start) {
            const int r = posInSa[start];
            if (r == 0) {
                continue;  // the smallest suffix has no predecessor
            }
            const int prev = sa[r - 1];  // suffix sorted immediately before this one
            if (matched > 0) {
                --matched;  // dropping the first character loses at most one match
            }
            while (start + matched < n && prev + matched < n &&
                   S[start + matched] == S[prev + matched]) {
                ++matched;
            }
            result[r] = matched;
        }
        return result;
    }

    // Prints the LCP values separated by spaces (with a trailing space), then a newline.
    void Print_LCP() {
        for (size_t r = 0; r < lcp.size(); ++r) {
            cout << lcp[r] << " ";
        }
        cout << endl;
    }

private:
    string S;         // the text
    vector<int> sa;   // its suffix array
    int n;            // S.size()
    vector<int> lcp;  // result of Cal_LCP()
};
int main() {
string S = "abcabc";
string P = "bc";
//朴素匹配:时间复杂度O(n*m)
cout << "Match: " << endl;
Match(S, P);
//Hash匹配:时间复杂度O(m*n)
cout << "HashMatch: " << endl;
HashMatch(S, P);
//Hash匹配优化:时间复杂度O(n+m)
cout << "HashMatchPlus: " << endl;
HashMatchPlus(S, P);
//KMP匹配:时间复杂度O(n+m)
cout << "KMP: " << endl;
KMP(S, P);
//后缀数组匹配:时间复杂度O(n^2 lgn)
cout << "SA: " << endl;
vector<Suffix>suffixArray = Suffix::buildSuffixArray(S, S.size());
Suffix::search(P, S, suffixArray);
//后缀数组倍增法优化:时间复杂度O(n lgn)
cout << "SAO: " << endl;
SuffixArray sa(S);
sa.SA_Match(P);
//后缀数组的最长公共前缀(LCP)数组
cout << "LCP: " << endl;
LCPArray lcpArray(S, sa.getSA());
lcpArray.Print_LCP();
return 0;
}