算法练习三

最新推荐文章于 2022-01-03 10:15:16 发布

上帝爱民

最新推荐文章于 2022-01-03 10:15:16 发布

阅读量164

点赞数

分类专栏：算法题文章标签：算法字符串

本文链接：https://blog.csdn.net/qq_44781747/article/details/111438962

版权

算法题专栏收录该内容

30 篇文章 1 订阅

订阅专栏

算法分析与设计练习三

内容

分别用KMP、Monte Carlo和Las Vegas算法编写subString，随机生成10000对、长度较长、且长度不等的01文本串X(haystack)和模式Y(needle)（三个程序处理相同的串）

假设文本串长度为size1，模式串长度为size2

1.subStringKMP

思路：

(1).KMP的主要思想
当出现字符串不匹配时，可以知道一部分之前已经匹配的文本内容，可以利用这些信息避免从头再去做匹配了。

(2).next数组 == 前缀表（prefix table）
1)前缀表记录下标i之前（包括i）的字符串中，有多大长度的相同前缀后缀。
2)前缀表是用来回退的，它记录了模式串与文本串不匹配的时候，模式串应该从哪里开始重新匹配。
KMP

3)下标5之前（不包括5）这部分的字符串（也就是字符串aabaa）的最长相等的前缀和后缀字符串是子字符串aa。因为找到了最长相等的前缀和后缀，匹配失败的位置是后缀子串的后面，那么我们从与其相同的前缀的后面重新匹配就可以了。

(3).构造next数组
其实就是计算模式串s，前缀表的过程。
1）初始化
2）处理前后缀不相同的情况：向前回溯
3）处理前后缀相同的情况

2.subStringMC

Monte Carlo算法：

总能得到问题的一个解，但不一定是正确解。然而可以通过多次运行原算法，并且满足每次运行时的随机选择都相互独立，这样就使产生非正确解的概率减到任意小。

思路：

通过对字符串“取指纹”判断字符串是否相等
取指纹的方法：

由于文本串和模式串是由0-1组成，将文本串和模式串由二进制转换为十进制。选取一个质数P ，对文本串和模式串的十进制编码与P做取模运算。若取模结果相同，则视为二者匹配。

文本串取指纹的优化方法：

由于在文本串中相邻的子串的“取指纹”值满足下面的关系：

getFp(X(i + 1)) = (2 * getFp(X(i)) - 2 ^ size2 * X[i] + X[i + size2]) % P,

其中getFp为取指纹函数，X(i)表示文本串中以X[i]为开头、长度为size2的子串；2 ^ size2表示2的size2次方，X[i]取该位的数值0或1（而不是字符'0'、'1'的编码）。

3.subStringLV

Las Vegas算法：

不断调用随机算法求解，直到求得正确解或调用次数达到某个阈值。所以，不一定能得到解，如果能得到解，一定是正确解。

思路：

在Monte Carlo算法的基础上进行优化：

当两字符串的取指纹结果相等时，则进行逐一比对，若比对结果相同，则二者匹配；若比对结果不同，则继续向后进行模式匹配。

4.源码

#include <iostream>
#include <vector>
#include <string>
#include <cstdlib>
#include <ctime>
#include <cmath>

using namespace std;

// Builds the KMP prefix table ("next" array) for `needle`.
//
// After the call, next[i] holds the index of the last character of the
// longest proper prefix of needle[0..i] that is also a suffix of it, or -1
// when no such prefix exists (the stored value is "length - 1").
//
// next   - output table. It is grown to needle.size() entries when it is
//          shorter; entries beyond needle.size() are left untouched.
// needle - pattern to analyse. An empty pattern leaves `next` unchanged.
void getNext(vector<int>& next, const string& needle)
{
    const int length = static_cast<int>(needle.size());

    // An empty pattern has no table; writing next[0] would be out of bounds.
    if (length == 0)
    {
        return;
    }

    // Do not depend on the caller having pre-sized the table.
    if (next.size() < needle.size())
    {
        next.resize(needle.size());
    }

    // Initialisation: a single character has no proper prefix/suffix.
    int j = -1;
    next[0] = j;

    for (int i = 1; i < length; ++i)
    {
        // Mismatch: fall back to the next shorter candidate prefix.
        while (j >= 0 && needle[i] != needle[j + 1])
        {
            j = next[j];
        }

        // Match: the common prefix/suffix grows by one character.
        if (needle[i] == needle[j + 1])
        {
            ++j;
        }

        next[i] = j;
    }
}

// KMP search: returns the index of the first occurrence of `needle` inside
// `haystack`. An empty needle yields 0; -1 is returned when the needle does
// not occur (including when the haystack is shorter than the needle).
int subStringKMP(string haystack, string needle)
{
    if (needle.empty())
    {
        return 0;
    }
    // Also covers the empty-haystack case, since the needle is non-empty here.
    if (haystack.size() < needle.size())
    {
        return -1;
    }

    const int textLength = haystack.size();
    const int patternLength = needle.size();

    // Prefix table of the pattern, filled in by getNext.
    vector<int> prefixTable(patternLength);
    getNext(prefixTable, needle);

    // `matched` is the index of the last pattern character matched so far.
    int matched = -1;
    int pos = 0;
    while (pos < textLength)
    {
        const char current = haystack[pos];

        // On mismatch, reuse the already-matched prefix instead of restarting.
        while (matched >= 0 && current != needle[matched + 1])
        {
            matched = prefixTable[matched];
        }

        if (current == needle[matched + 1])
        {
            ++matched;
        }

        // Whole pattern matched: report where the occurrence starts.
        if (matched + 1 == patternLength)
        {
            return pos + 1 - patternLength;
        }

        ++pos;
    }

    return -1;
}

// Large prime modulus (1,000,000,007) used when taking fingerprints.
const int mod = 1000000007;

// Helper: interprets `s` as a binary number, most significant digit first,
// and returns its value.
//
// A character equal to '1' contributes a set bit; every other character is
// treated as 0. The value is accumulated with integer shifts rather than
// floating-point pow(), so it is exact for every input of up to 63 digits.
// Longer inputs cannot be represented in a long long: their high-order
// digits are shifted out of the 64-bit accumulator.
long long binaryToDecimal(string s)
{
    // Unsigned accumulator: shifting out high bits is well defined.
    unsigned long long value = 0;

    for (const char digit : s)
    {
        value = (value << 1) | (digit == '1' ? 1ULL : 0ULL);
    }

    return static_cast<long long>(value);
}

// Helper: takes the fingerprint of a number, i.e. its remainder modulo `mod`.
int getFingerprint(long long decimal)
{
    const long long remainder = decimal % mod;
    return remainder;
}


// Monte Carlo substring search on 0/1 strings.
//
// Compares the fingerprint (binary value modulo `mod`) of `needle` with the
// fingerprint of every window of haystack of the same length and returns the
// index of the first window whose fingerprint is equal. Because only
// fingerprints are compared, the reported index can be a false positive.
//
// Returns 0 for an empty needle and -1 when no window has an equal
// fingerprint (including when the haystack is shorter than the needle).
// A character equal to '1' counts as bit 1; any other character as bit 0.
int subStringMC(string haystack, string needle)
{
    const int size1 = static_cast<int>(haystack.size());
    const int size2 = static_cast<int>(needle.size());
    if (size2 == 0) return 0;
    if (size1 < size2) return -1;

    // All arithmetic is done modulo `prime` in 64 bits, so nothing overflows
    // regardless of the pattern length.
    const long long prime = mod;

    long long needleFingerprint = 0;
    long long windowFingerprint = 0;
    long long topWeight = 1; // becomes 2^size2 mod prime
    for (int k = 0; k < size2; ++k)
    {
        needleFingerprint = (needleFingerprint * 2 + (needle[k] == '1' ? 1 : 0)) % prime;
        windowFingerprint = (windowFingerprint * 2 + (haystack[k] == '1' ? 1 : 0)) % prime;
        topWeight = topWeight * 2 % prime;
    }

    const int lastStart = size1 - size2;
    for (int i = 0; i <= lastStart; ++i)
    {
        if (needleFingerprint == windowFingerprint)
        {
            return i;
        }

        // No further window to slide to.
        if (i == lastStart)
        {
            break;
        }

        // Slide the window one position to the right:
        // fp(i + 1) = (2 * fp(i) - 2^size2 * X[i] + X[i + size2]) mod prime,
        // where X[.] are bit values (0 or 1), not character codes.
        const long long outgoingBit = (haystack[i] == '1') ? 1 : 0;
        const long long incomingBit = (haystack[i + size2] == '1') ? 1 : 0;
        windowFingerprint =
            (2 * windowFingerprint - topWeight * outgoingBit + incomingBit) % prime;

        // operator% keeps the sign of the dividend; bring it back into [0, prime).
        if (windowFingerprint < 0)
        {
            windowFingerprint += prime;
        }
    }

    return -1;
}

// Las Vegas substring search on 0/1 strings.
//
// Works like the fingerprint search (binary value modulo `mod` of each
// window of haystack compared with that of needle), but every fingerprint
// hit is confirmed by a character-by-character comparison, so a returned
// index is always a real occurrence.
//
// Returns the index of the first occurrence of `needle` in `haystack`, 0 for
// an empty needle, and -1 when there is none (including when the haystack is
// shorter than the needle). For the fingerprint, a character equal to '1'
// counts as bit 1 and any other character as bit 0.
int subStringLV(string haystack, string needle)
{
    const int size1 = static_cast<int>(haystack.size());
    const int size2 = static_cast<int>(needle.size());
    if (size2 == 0) return 0;
    if (size1 < size2) return -1;

    // All arithmetic is done modulo `prime` in 64 bits, so nothing overflows
    // regardless of the pattern length.
    const long long prime = mod;

    long long needleFingerprint = 0;
    long long windowFingerprint = 0;
    long long topWeight = 1; // becomes 2^size2 mod prime
    for (int k = 0; k < size2; ++k)
    {
        needleFingerprint = (needleFingerprint * 2 + (needle[k] == '1' ? 1 : 0)) % prime;
        windowFingerprint = (windowFingerprint * 2 + (haystack[k] == '1' ? 1 : 0)) % prime;
        topWeight = topWeight * 2 % prime;
    }

    const int lastStart = size1 - size2;
    for (int i = 0; i <= lastStart; ++i)
    {
        if (needleFingerprint == windowFingerprint)
        {
            // Equal fingerprints only suggest a match; confirm it exactly.
            // The bound is tested before indexing.
            int j = 0;
            while (j < size2 && haystack[i + j] == needle[j])
            {
                ++j;
            }
            if (j == size2) return i;
        }

        // No further window to slide to.
        if (i == lastStart)
        {
            break;
        }

        // Slide the window one position to the right:
        // fp(i + 1) = (2 * fp(i) - 2^size2 * X[i] + X[i + size2]) mod prime,
        // where X[.] are bit values (0 or 1), not character codes.
        const long long outgoingBit = (haystack[i] == '1') ? 1 : 0;
        const long long incomingBit = (haystack[i + size2] == '1') ? 1 : 0;
        windowFingerprint =
            (2 * windowFingerprint - topWeight * outgoingBit + incomingBit) % prime;

        // operator% keeps the sign of the dividend; bring it back into [0, prime).
        if (windowFingerprint < 0)
        {
            windowFingerprint += prime;
        }
    }

    return -1;
}

// Driver: generates random 0/1 text/pattern pairs, runs the three search
// functions on the same pairs, prints the CPU time each one took, and
// compares the Monte Carlo and Las Vegas results against KMP.
int main()
{
    // Experiment parameters.
    const int pairCount = 10000;
    const int haystackLength = 500;
    const int needleLength = 50;

    // Text strings
    vector<string> haystacks(pairCount);
    // Pattern strings
    vector<string> needles(pairCount);

    // Fill both arrays with random '0'/'1' characters.
    srand(static_cast<unsigned>(time(nullptr)));
    for (int i = 0; i < pairCount; ++i)
    {
        for (int j = 0; j < haystackLength; ++j)
        {
            haystacks[i].push_back((rand() % 2) + '0');
        }

        for (int k = 0; k < needleLength; ++k)
        {
            needles[i].push_back((rand() % 2) + '0');
        }
    }

    // clock() counts ticks; CLOCKS_PER_SEC ticks make one second.
    const double millisecondsPerTick = 1000.0 / CLOCKS_PER_SEC;

    // KMP
    // The result vectors are created with pairCount slots and filled by
    // index, so slot i always belongs to pair i.
    vector<int> result1(pairCount);
    clock_t start1 = clock();
    for (int i = 0; i < pairCount; ++i)
    {
        result1[i] = subStringKMP(haystacks[i], needles[i]);
    }
    clock_t finish1 = clock();

    double duration1 = static_cast<double>(finish1 - start1) * millisecondsPerTick;
    cout << "KMP算法的运行时间为: " << duration1 << "毫秒" << endl;

    // Monte Carlo
    vector<int> result2(pairCount);
    clock_t start2 = clock();
    for (int i = 0; i < pairCount; ++i)
    {
        result2[i] = subStringMC(haystacks[i], needles[i]);
    }
    clock_t finish2 = clock();

    double duration2 = static_cast<double>(finish2 - start2) * millisecondsPerTick;
    cout << "Monte Carlo算法的运行时间为: " << duration2 << "毫秒" << endl;

    // Las Vegas
    vector<int> result3(pairCount);
    clock_t start3 = clock();
    for (int i = 0; i < pairCount; ++i)
    {
        result3[i] = subStringLV(haystacks[i], needles[i]);
    }
    clock_t finish3 = clock();

    double duration3 = static_cast<double>(finish3 - start3) * millisecondsPerTick;
    cout << "Las Vegas算法的运行时间为: " << duration3 << "毫秒" << endl;

    // Monte Carlo accuracy: share of pairs whose answer agrees with KMP.
    int count = 0;
    for (int i = 0; i < pairCount; ++i)
    {
        if (result1[i] == result2[i])
        {
            ++count;
        }
    }
    // Divide in floating point; integer division would truncate to 0 or 1.
    double accuracy = static_cast<double>(count) / pairCount;
    cout << "Monte Carlo算法的准确率为:" << accuracy * 100 << "%" << endl;

    // Las Vegas must agree with KMP on every pair.
    bool flag = (result1 == result3);
    if (flag)
    {
        cout << "Las Vegas算法正确" << endl;
    }
    else
    {
        // Make a disagreement visible instead of printing nothing.
        cout << "Las Vegas算法错误" << endl;
    }

    return 0;
}

6.时间复杂度

KMP: O(size1 + size2)

Monte Carlo: O(size1 + size2)

Las Vegas: 期望 O(size1 + size2)；若指纹频繁冲突，每次逐一比对需要 O(size2)，最坏情况为 O(size1 * size2)

7.结论

随机数P应当取一个较大的质数。当P足够大时，Monte Carlo算法的出错率要比1/size1小得多。而Las Vegas算法一定正确。

上帝爱民

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
算法练习三

算法分析与设计练习三内容分别用KMP、Monte Carlo和Las Vegas算法编写subString，随机生成10000对、长度较长、且长度不等的01文本串X(haystack)和模式Y(needle)（三个程序处理相同的串）假设文本串长度为size1，模式串长度为size21.subStringKMP思路：(1).KMP的主要思想当出现字符串不匹配时，可以知道一部分之前已经匹配的文本内容，可以利用这些信息避免从头再去做匹配了。(2).next数组 == 前缀表（prefix tab
复制链接

扫一扫