字符串匹配算法（含LCP）

追逐远方的梦

已于 2024-09-30 16:48:02 修改

阅读量249

点赞数 6

分类专栏： C语言疑难杂症文章标签：算法哈希算法 c语言

于 2024-09-26 23:51:03 首次发布

本文链接：https://blog.csdn.net/weixin_73453526/article/details/142580165

版权

C语言疑难杂症专栏收录该内容

27 篇文章

订阅专栏

设S为源串，P为模式串——字符串匹配指的是S是否包含P

m = S.size();
n = P.size();

朴素匹配法： O(m*n)

对于S中下标在【0，m-n】的每个起始位置，逐个字符向后遍历并与P进行比较。
复杂度较高。


// Naive matching: O(m*n).
// Tries every start position of S and prints "Pattern found at index <i>"
// for each position i at which P occurs. Prints nothing when P is empty or
// longer than S.
void Match(const string& S, const string& P) {
    const int textLen = S.size();
    const int patLen = P.size();

    // Degenerate inputs: nothing to report.
    if (patLen == 0 || textLen < patLen) {
        return;
    }

    const int lastStart = textLen - patLen;
    for (int start = 0; start <= lastStart; ++start) {
        // Compare the window S[start, start + patLen) with P character by character.
        bool matched = true;
        for (int k = 0; k < patLen; ++k) {
            if (S[start + k] != P[k]) {
                matched = false;
                break;
            }
        }
        if (matched) {
            cout << "Pattern found at index " << start << endl;
        }
    }
}

哈希匹配法： O(m*n)

利用进制的思想将字符串映射到整数域，不同的字符串大概率对应不同的hash值。计算S中所有长度为n的子串的hash值，并与P的hash值比较即可。
复杂度较高。并且有可能产生哈希冲突，需要特殊判断处理。

// Polynomial hash of s in base `seed`, reduced modulo `mod`.
// The returned value is always normalised into [0, mod), so it can be compared
// directly with a rolling hash that is kept non-negative.
long Hash(const string& s) {
    // 64-bit accumulator: acc * seed can reach roughly 31 * 1e9, which does not
    // fit into `long` on platforms where long is 32 bits wide.
    long long acc = 0;
    for (char ch : s) {
        acc = (acc * seed + ch) % mod; // reduce at every step
        // `char` may be signed; a negative byte can make the remainder negative.
        if (acc < 0) {
            acc += mod;
        }
    }
    return static_cast<long>(acc);
}



// Hash matching: O(m*n).
// Hashes every length-n window of S from scratch and compares it with the hash
// of P; windows with an equal hash are verified character by character before
// "Pattern found at index <i>" is printed.
void HashMatch(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();

    if (n == 0 || m < n) return; // empty pattern or pattern longer than text

    const long hash_p = Hash(P);

    // Each window hash is used as soon as it is computed. The earlier fixed-size
    // buffer (long hash_s[Size]) was written out of bounds as soon as the text
    // contained more than Size windows.
    for (int i = 0; i + n <= m; i++) {
        if (Hash(S.substr(i, n)) != hash_p) {
            continue;
        }
        // Equal hashes may still be a collision, so confirm the actual characters.
        if (S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
    }
}

Rabin-Karp（滚动哈希优化）： O(m+n)

哈希法的优化版，利用滑动窗口的思想，将对于每个hash值的更新复杂度降到常数级：n->2，即减去第一个字符的值，加上后一个字符的值。
复杂度较低，也存在hash冲突，需特殊处理。

// Rabin-Karp: expected O(m+n) using a rolling hash.
// The hash of each window is derived from the previous one in constant time:
// multiply by seed, add the incoming character and subtract the outgoing
// character times seed^n. Candidate positions are verified to rule out
// hash collisions.
void HashMatchPlus(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();

    if (n == 0 || m < n) return; // empty pattern or pattern longer than text

    // Bring both starting hashes into [0, mod). The rolled value below is always
    // non-negative, so a negative remainder handed back by Hash (possible when
    // char is signed and the input has bytes >= 0x80) would never compare equal.
    const long long hash_p = ((Hash(P) % mod) + mod) % mod;
    long long hash_s = ((Hash(S.substr(0, n)) % mod) + mod) % mod;

    // seed^n % mod (could be computed with fast exponentiation).
    // 64-bit arithmetic throughout: the products below exceed the 32-bit range.
    long long seed_n = 1;
    for (int i = 0; i < n; i++) {
        seed_n = (seed_n * seed) % mod;
    }

    for (int i = 0; i <= m - n; i++) {
        // Hash hit: confirm with a direct comparison of the window.
        if (hash_p == hash_s && S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
        if (i < m - n) {
            // Slide the window one character to the right.
            hash_s = (hash_s * seed + S[i + n] - S[i] * seed_n) % mod;
            if (hash_s < 0) {
                hash_s += mod; // keep the hash non-negative
            }
        }
    }
}

KMP： O(m+n)

主要在于next数组的计算，有点类似于预处理。next只与P有关，和S无关。
复杂度较低。

// Builds the KMP failure table ("next" array) for pattern P.
// Entry 0 is the sentinel -1; entry j (j >= 1) is the length of the longest
// proper prefix of P[0..j-1] that is also a suffix of it.
// Special cases: an empty pattern yields {-1}; a one-character pattern yields
// the two-element table {-1, 0}.
vector<int> Next(const string& P) {
    const int len = P.size();
    if (len < 2) {
        return len == 0 ? vector<int>{-1} : vector<int>{-1, 0};
    }

    vector<int> table(len, 0);
    table[0] = -1;
    table[1] = 0;

    int border = table[1]; // border length carried over from the previous position
    for (int pos = 1; pos < len - 1;) {
        if (border >= 0 && P[pos] != P[border]) {
            // Mismatch: fall back to the next shorter border.
            border = table[border];
            continue;
        }
        // Either the border was exhausted (-1) or it extends by one character.
        ++pos;
        ++border;
        table[pos] = border;
    }

    return table;
}

// KMP算法: O(m+n)
void KMP(const string& S, const string& P) {
    int m = S.size();
    int n = P.size();

    if(m<n || n==0 || m==0) return;

    vector<int> next=Next(P);

    int i=0,j=0;

    while (i < m) {
        if (j<0 || S[i] == P[j]) {
            i++;
            j++;
        }
        else {
            j = next[j];
        }

        if (j == n) {
			cout << "Pattern found at index " << i-j << endl;
            i--;
			j = next[j-1];
		}
    }
}

后缀数组法：大概 $O(n^2\log n)$（未优化）

求出源串的所有后缀，构成后缀数组。
对后缀数组按字典序排序，同时附带上每个后缀在源串中的起始下标（这里用类来封装，也可以用结构体），便于二分查找后输出匹配位置。
使用二分查找模式串在已排序后缀数组中的位置，找到后退出。
以mid为中心向前后线性遍历，找到所有匹配位置。

// 5. Suffix-array matching (unoptimised): about O(n^2 log n) to build.
// Each Suffix keeps a full copy of one suffix of the text together with the
// position at which that suffix starts.
class Suffix {
public:
    string suff; // text of the suffix
    int index;   // start position of the suffix in the original string
    Suffix(const string& suff, int index) : suff(suff), index(index) {}


    // Ordering predicate: lexicographic by suffix text.
    // Takes references so that sorting does not copy two strings per comparison.
    static bool cmp(const Suffix& a, const Suffix& b) {
        return a.suff < b.suff;
    }

    // Builds the sorted suffix array for the suffixes of s starting at 0..n-1,
    // prints every "<suffix> <index>" pair followed by a blank line, and
    // returns the sorted array.
    static vector<Suffix> buildSuffixArray(const string& s, int n)
    {
        vector<Suffix> suffixArray;

        for (int i = 0; i < n; i++) {
            suffixArray.push_back(Suffix(s.substr(i), i));
        }

        sort(suffixArray.begin(), suffixArray.end(), cmp);

        for (int i = 0; i < n; i++)
            cout << suffixArray[i].suff << " " << suffixArray[i].index << endl;
        cout << endl;

        return suffixArray;
    }

    // Binary-searches the sorted suffix array for a suffix that starts with pat
    // and prints "Pattern found at index <i>" for every such suffix: first the
    // one hit by the binary search, then its neighbours before and after it.
    // Prints nothing when pat does not occur.
    // txt is kept for interface compatibility; the bounds come from suffixArray.
    static void search(const string& pat, const string& txt, const vector<Suffix>& suffixArray)
    {
        (void)txt;
        const int m = pat.size();
        const int n = suffixArray.size();

        int l = 0, r = n - 1, mid = 0;

        bool found = false;
        while (l <= r)
        {
            mid = l + (r - l) / 2;

            // Compare the first m characters of the suffix with pat (no temporary string).
            const int order = suffixArray[mid].suff.compare(0, m, pat);

            if (order == 0) {
                found = true;
                break;
            }
            else if (order < 0)
                l = mid + 1;
            else
                r = mid - 1;
        }

        // Without this guard a failed search reported the last probed suffix
        // as a match (and indexed into an empty array).
        if (!found) return;

        cout << "Pattern found at index " << suffixArray[mid].index << endl;

        // All matching suffixes are adjacent in sorted order: walk outwards from mid.
        int cnt = mid;
        while (cnt > 0 && suffixArray[cnt - 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[--cnt].index << endl;
        }

        cnt = mid;
        while (cnt < n - 1 && suffixArray[cnt + 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[++cnt].index << endl;
        }
    }
};

后缀数组优化：使用倍增法优化至 $O(n\log^2 n)$

和后缀数组的操作类似，不过免去了存储后缀字符串的方式，改用存储原后缀下标，并且采用倍增法进行排序，降低了时间复杂度。

// Suffix array built by prefix doubling: O(n (log n)^2).
// Stores only the start indices of the suffixes instead of the suffix strings.
class SuffixArray {
public:
    SuffixArray(const string& S) : S(S), n(S.size()) {
        sa = Cal_SA();
    }

    // Computes the suffix array of S, prints it on one line, and returns it.
    vector<int> Cal_SA() {
        vector<int> rank(n), temp_rank(n), sa(n);

        // Start from single characters. Bytes are ranked as unsigned so that the
        // order agrees with string::compare, which SA_Match relies on.
        for (int i = 0; i < n; ++i) {
            sa[i] = i;
            rank[i] = static_cast<unsigned char>(S[i]);
        }

        // k is the prefix length already ranked; each round doubles it.
        for (int k = 1; k < n; k *= 2) {
            // Order by (rank of first k chars, rank of the following k chars);
            // a suffix that ends before the second half sorts first (-1).
            auto cmp = [this, &rank, k](int a, int b) {
                if (rank[a] != rank[b]) return rank[a] < rank[b];
                int ra = (a + k < n) ? rank[a + k] : -1;
                int rb = (b + k < n) ? rank[b + k] : -1;
                return ra < rb;
            };

            sort(sa.begin(), sa.end(), cmp);

            // Assign new ranks: equal neighbours share a rank.
            temp_rank[sa[0]] = 0;
            for (int i = 1; i < n; ++i) {
                temp_rank[sa[i]] = temp_rank[sa[i - 1]] + (cmp(sa[i - 1], sa[i]) ? 1 : 0);
            }
            swap(rank, temp_rank);

            // All ranks distinct: the order is final, further rounds change nothing.
            if (rank[sa[n - 1]] == n - 1) break;
        }
        for (auto i : sa)
            cout << i << " ";
        cout << endl;

        return sa;
    }

    // Binary-searches the suffix array for a suffix starting with P and prints
    // "Pattern found at index <i>" for every occurrence: first the one hit by
    // the binary search, then its neighbours before and after it.
    // Prints nothing when P is empty, S is empty, or P does not occur.
    void SA_Match(const string& P) {
        const int m = P.size();
        if (m == 0 || n == 0) return;

        int left = 0, right = n - 1, mid = 0;
        bool found = false;
        while (left <= right) {
            mid = left + (right - left) / 2;
            // Compare at most m characters of the suffix with P, once per probe.
            const int order = S.compare(sa[mid], m, P);
            if (order < 0)
                left = mid + 1;
            else if (order > 0)
                right = mid - 1;
            else {
                found = true;
                break;
            }
        }

        // Without this guard a failed search reported the last probed suffix as a match.
        if (!found) return;

        cout << "Pattern found at index " << sa[mid] << endl;

        // Matching suffixes are adjacent in the sorted order: walk outwards from mid.
        int cnt = mid;
        while (cnt > 0 && S.compare(sa[cnt - 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[--cnt] << endl;
        }

        cnt = mid;
        while (cnt < n - 1 && S.compare(sa[cnt + 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[++cnt] << endl;
        }

    }

private:
    string S;       // the indexed text
    int n;          // length of S
    vector<int> sa; // start indices of the suffixes in sorted order
};

总代码如下：

#include <iostream>  
#include <string>  
#include <vector>
#include <cmath>  
#include <algorithm>

#define seed 31  
#define mod 1000000007  
#define Size 1000
using namespace std;

// Polynomial hash of s in base `seed`, reduced modulo `mod`.
// The returned value is always normalised into [0, mod), so it can be compared
// directly with a rolling hash that is kept non-negative.
long Hash(const string& s) {
    // 64-bit accumulator: acc * seed can reach roughly 31 * 1e9, which does not
    // fit into `long` on platforms where long is 32 bits wide.
    long long acc = 0;
    for (char ch : s) {
        acc = (acc * seed + ch) % mod; // reduce at every step
        // `char` may be signed; a negative byte can make the remainder negative.
        if (acc < 0) {
            acc += mod;
        }
    }
    return static_cast<long>(acc);
}

// Builds the KMP failure table ("next" array) for pattern P.
// Entry 0 is the sentinel -1; entry j (j >= 1) is the length of the longest
// proper prefix of P[0..j-1] that is also a suffix of it.
// Special cases: an empty pattern yields {-1}; a one-character pattern yields
// the two-element table {-1, 0}.
vector<int> Next(const string& P) {
    const int len = P.size();
    if (len < 2) {
        return len == 0 ? vector<int>{-1} : vector<int>{-1, 0};
    }

    vector<int> table(len, 0);
    table[0] = -1;
    table[1] = 0;

    int border = table[1]; // border length carried over from the previous position
    for (int pos = 1; pos < len - 1;) {
        if (border >= 0 && P[pos] != P[border]) {
            // Mismatch: fall back to the next shorter border.
            border = table[border];
            continue;
        }
        // Either the border was exhausted (-1) or it extends by one character.
        ++pos;
        ++border;
        table[pos] = border;
    }

    return table;
}

// 1. Naive matching: O(m*n).
// Tries every start position of S and prints "Pattern found at index <i>"
// for each position i at which P occurs. Prints nothing when P is empty or
// longer than S.
void Match(const string& S, const string& P) {
    const int textLen = S.size();
    const int patLen = P.size();

    // Degenerate inputs: nothing to report.
    if (patLen == 0 || textLen < patLen) {
        return;
    }

    const int lastStart = textLen - patLen;
    for (int start = 0; start <= lastStart; ++start) {
        // Compare the window S[start, start + patLen) with P character by character.
        bool matched = true;
        for (int k = 0; k < patLen; ++k) {
            if (S[start + k] != P[k]) {
                matched = false;
                break;
            }
        }
        if (matched) {
            cout << "Pattern found at index " << start << endl;
        }
    }
}

// 2. Hash matching: O(m*n).
// Hashes every length-n window of S from scratch and compares it with the hash
// of P; windows with an equal hash are verified character by character before
// "Pattern found at index <i>" is printed.
void HashMatch(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();

    if (n == 0 || m < n) return; // empty pattern or pattern longer than text

    const long hash_p = Hash(P);

    // Each window hash is used as soon as it is computed. The earlier fixed-size
    // buffer (long hash_s[Size]) was written out of bounds as soon as the text
    // contained more than Size windows.
    for (int i = 0; i + n <= m; i++) {
        if (Hash(S.substr(i, n)) != hash_p) {
            continue;
        }
        // Equal hashes may still be a collision, so confirm the actual characters.
        if (S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
    }
}

// 3. Rabin-Karp: expected O(m+n) using a rolling hash.
// The hash of each window is derived from the previous one in constant time:
// multiply by seed, add the incoming character and subtract the outgoing
// character times seed^n. Candidate positions are verified to rule out
// hash collisions.
void HashMatchPlus(const string& S, const string& P) {
    const int m = S.size();
    const int n = P.size();

    if (n == 0 || m < n) return; // empty pattern or pattern longer than text

    // Bring both starting hashes into [0, mod). The rolled value below is always
    // non-negative, so a negative remainder handed back by Hash (possible when
    // char is signed and the input has bytes >= 0x80) would never compare equal.
    const long long hash_p = ((Hash(P) % mod) + mod) % mod;
    long long hash_s = ((Hash(S.substr(0, n)) % mod) + mod) % mod;

    // seed^n % mod (could be computed with fast exponentiation).
    // 64-bit arithmetic throughout: the products below exceed the 32-bit range.
    long long seed_n = 1;
    for (int i = 0; i < n; i++) {
        seed_n = (seed_n * seed) % mod;
    }

    for (int i = 0; i <= m - n; i++) {
        // Hash hit: confirm with a direct comparison of the window.
        if (hash_p == hash_s && S.compare(i, n, P) == 0) {
            cout << "Pattern found at index " << i << endl;
        }
        if (i < m - n) {
            // Slide the window one character to the right.
            hash_s = (hash_s * seed + S[i + n] - S[i] * seed_n) % mod;
            if (hash_s < 0) {
                hash_s += mod; // keep the hash non-negative
            }
        }
    }
}

// 4.KMP算法: O(m+n)
void KMP(const string& S, const string& P) {
    int m = S.size();
    int n = P.size();

    if (m < n || n == 0 || m == 0) return;

    vector<int> next = Next(P);

    int i = 0, j = 0;

    while (i < m) {
        if (j < 0 || S[i] == P[j]) {
            i++;
            j++;
        }
        else {
            j = next[j];
        }

        if (j == n) {
            cout << "Pattern found at index " << i - j << endl;
            i--;
            j = next[j - 1];
        }
    }
}

// 5. Suffix-array matching (unoptimised): about O(n^2 log n) to build.
// Each Suffix keeps a full copy of one suffix of the text together with the
// position at which that suffix starts.
class Suffix {
public:
    string suff; // text of the suffix
    int index;   // start position of the suffix in the original string
    Suffix(const string& suff, int index) : suff(suff), index(index) {}


    // Ordering predicate: lexicographic by suffix text.
    // Takes references so that sorting does not copy two strings per comparison.
    static bool cmp(const Suffix& a, const Suffix& b) {
        return a.suff < b.suff;
    }

    // Builds the sorted suffix array for the suffixes of s starting at 0..n-1,
    // prints every "<suffix> <index>" pair followed by a blank line, and
    // returns the sorted array.
    static vector<Suffix> buildSuffixArray(const string& s, int n)
    {
        vector<Suffix> suffixArray;

        for (int i = 0; i < n; i++) {
            suffixArray.push_back(Suffix(s.substr(i), i));
        }

        sort(suffixArray.begin(), suffixArray.end(), cmp);

        for (int i = 0; i < n; i++)
            cout << suffixArray[i].suff << " " << suffixArray[i].index << endl;
        cout << endl;

        return suffixArray;
    }

    // Binary-searches the sorted suffix array for a suffix that starts with pat
    // and prints "Pattern found at index <i>" for every such suffix: first the
    // one hit by the binary search, then its neighbours before and after it.
    // Prints nothing when pat does not occur.
    // txt is kept for interface compatibility; the bounds come from suffixArray.
    static void search(const string& pat, const string& txt, const vector<Suffix>& suffixArray)
    {
        (void)txt;
        const int m = pat.size();
        const int n = suffixArray.size();

        int l = 0, r = n - 1, mid = 0;

        bool found = false;
        while (l <= r)
        {
            mid = l + (r - l) / 2;

            // Compare the first m characters of the suffix with pat (no temporary string).
            const int order = suffixArray[mid].suff.compare(0, m, pat);

            if (order == 0) {
                found = true;
                break;
            }
            else if (order < 0)
                l = mid + 1;
            else
                r = mid - 1;
        }

        // Without this guard a failed search reported the last probed suffix
        // as a match (and indexed into an empty array).
        if (!found) return;

        cout << "Pattern found at index " << suffixArray[mid].index << endl;

        // All matching suffixes are adjacent in sorted order: walk outwards from mid.
        int cnt = mid;
        while (cnt > 0 && suffixArray[cnt - 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[--cnt].index << endl;
        }

        cnt = mid;
        while (cnt < n - 1 && suffixArray[cnt + 1].suff.compare(0, m, pat) == 0) {
            cout << "Pattern found at index " << suffixArray[++cnt].index << endl;
        }
    }
};

// 6. Suffix array built by prefix doubling: O(n (log n)^2).
// Stores only the start indices of the suffixes instead of the suffix strings.
class SuffixArray {
public:
    SuffixArray(const string& S) : S(S), n(S.size()) {
        sa = Cal_SA();
    }

    // Computes and returns the suffix array of S.
    vector<int> Cal_SA() {
        vector<int> rank(n), temp_rank(n), sa(n);

        // Start from single characters. Bytes are ranked as unsigned so that the
        // order agrees with string::compare, which SA_Match relies on.
        for (int i = 0; i < n; ++i) {
            sa[i] = i;
            rank[i] = static_cast<unsigned char>(S[i]);
        }

        // k is the prefix length already ranked; each round doubles it.
        for (int k = 1; k < n; k *= 2) {
            // Order by (rank of first k chars, rank of the following k chars);
            // a suffix that ends before the second half sorts first (-1).
            auto cmp = [this, &rank, k](int a, int b) {
                if (rank[a] != rank[b]) return rank[a] < rank[b];
                int ra = (a + k < n) ? rank[a + k] : -1;
                int rb = (b + k < n) ? rank[b + k] : -1;
                return ra < rb;
            };

            sort(sa.begin(), sa.end(), cmp);

            // Assign new ranks: equal neighbours share a rank.
            temp_rank[sa[0]] = 0;
            for (int i = 1; i < n; ++i) {
                temp_rank[sa[i]] = temp_rank[sa[i - 1]] + (cmp(sa[i - 1], sa[i]) ? 1 : 0);
            }
            swap(rank, temp_rank);

            // All ranks distinct: the order is final, further rounds change nothing.
            if (rank[sa[n - 1]] == n - 1) break;
        }
        return sa;
    }

    // Binary-searches the suffix array for a suffix starting with P and prints
    // "Pattern found at index <i>" for every occurrence: first the one hit by
    // the binary search, then its neighbours before and after it.
    // Prints nothing when P is empty, S is empty, or P does not occur.
    void SA_Match(const string& P) {
        const int m = P.size();
        if (m == 0 || n == 0) return;

        int left = 0, right = n - 1, mid = 0;
        bool found = false;
        while (left <= right) {
            mid = left + (right - left) / 2;
            // Compare at most m characters of the suffix with P, once per probe.
            const int order = S.compare(sa[mid], m, P);
            if (order < 0)
                left = mid + 1;
            else if (order > 0)
                right = mid - 1;
            else {
                found = true;
                break;
            }
        }

        // Without this guard a failed search reported the last probed suffix as a match.
        if (!found) return;

        cout << "Pattern found at index " << sa[mid] << endl;

        // Matching suffixes are adjacent in the sorted order: walk outwards from mid.
        int cnt = mid;
        while (cnt > 0 && S.compare(sa[cnt - 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[--cnt] << endl;
        }

        cnt = mid;
        while (cnt < n - 1 && S.compare(sa[cnt + 1], m, P) == 0) {
            cout << "Pattern found at index " << sa[++cnt] << endl;
        }
    }

    // Returns a copy of the computed suffix array.
    vector<int> getSA() const { return sa; }

private:
    string S;       // the indexed text
    int n;          // length of S
    vector<int> sa; // start indices of the suffixes in sorted order
};

// Longest-common-prefix (LCP) array over a suffix array.
// Entry r holds the length of the common prefix of the suffixes at sorted
// positions r-1 and r; entry 0 stays 0.
class LCPArray {
public:
    LCPArray(const string& S, const vector<int>& _sa) : S(S), sa(_sa), n(S.size()) {
        lcp = Cal_LCP();
    }

    // Computes the LCP array by visiting the suffixes in text order and reusing
    // the previous common-prefix length (minus one) as the starting point.
    vector<int> Cal_LCP() {
        // pos[i] = sorted position of the suffix that starts at text index i.
        vector<int> pos(n);
        for (int r = 0; r < n; ++r) {
            pos[sa[r]] = r;
        }

        vector<int> result(n);
        int common = 0;
        for (int start = 0; start < n; ++start) {
            const int r = pos[start];
            if (r == 0) {
                continue; // the smallest suffix has no predecessor
            }
            const int prev = sa[r - 1];
            if (common > 0) {
                --common;
            }
            while (start + common < n && prev + common < n
                   && S[start + common] == S[prev + common]) {
                ++common;
            }
            result[r] = common;
        }

        return result;
    }

    // Prints the LCP values on one line, each followed by a space.
    void Print_LCP() {
        for (size_t k = 0; k < lcp.size(); ++k) {
            cout << lcp[k] << " ";
        }
        cout << endl;
    }

private:
    string S;        // the indexed text
    vector<int> sa;  // suffix array of S
    int n;           // length of S
    vector<int> lcp; // computed LCP values
};

int main() {
    string S = "abcabc";
    string P = "bc";

    //朴素匹配：时间复杂度O(n*m)
    cout << "Match: " << endl;
    Match(S, P);

    //Hash匹配：时间复杂度O(m*n)
    cout << "HashMatch: " << endl;
    HashMatch(S, P);

    //Hash匹配优化：时间复杂度O(n+m)
    cout << "HashMatchPlus: " << endl;
    HashMatchPlus(S, P);

    //KMP匹配：时间复杂度O(n+m)
    cout << "KMP: " << endl;
    KMP(S, P);

    //后缀数组匹配：时间复杂度O(n^2 lgn)
    cout << "SA: " << endl;
    vector<Suffix>suffixArray = Suffix::buildSuffixArray(S, S.size());
    Suffix::search(P, S, suffixArray);

    //后缀数组倍增法优化：时间复杂度O(n lgn)
    cout << "SAO: " << endl;
    SuffixArray sa(S);
    sa.SA_Match(P);

    //后缀数组的最长公共前缀（LCP）数组
    cout << "LCP: " << endl;
    LCPArray lcpArray(S, sa.getSA());
    lcpArray.Print_LCP();

    return 0;
}