BM(Boyer-Moore)算法在每次对齐时,从模式串的末尾向前(从右往左)逐字符比较。
BM 算法利用两条规则来决定失配后模式串的移动距离:坏字符规则和好后缀规则。
#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
// Number of entries in the bad-character table: one per possible byte value.
const std::size_t MAX_CHAR = 256;
/**
 * Builds the bad-character table.
 * After the call, bad_char[c] is the distance from the last occurrence of
 * byte c in the pattern to the pattern's final position, or the pattern
 * length when c does not occur in the pattern at all.
 * @param patten   the pattern string
 * @param bad_char output table; must have room for MAX_CHAR entries
 */
void get_bad_char(const std::string& patten, int bad_char[]) {
    const int patten_len = static_cast<int>(patten.size());
    // Default: the byte is absent from the pattern.
    for (std::size_t c = 0; c < MAX_CHAR; ++c) {
        bad_char[c] = patten_len;
    }
    for (int i = 0; i < patten_len; ++i) {
        // Index through unsigned char: plain char may be signed, in which case
        // bytes >= 0x80 would turn into negative indices and write outside the table.
        // Later positions overwrite earlier ones, so the last occurrence wins.
        bad_char[static_cast<unsigned char>(patten[i])] = patten_len - i - 1;
    }
}
/**
 * Builds the suffix-length table.
 * suffix[i] is the length of the longest string that ends at patten[i] and is
 * also a suffix of the whole pattern (so suffix[patten_len - 1] == patten_len).
 * An empty pattern leaves the output untouched.
 * @param patten the pattern string
 * @param suffix output table; must have room for patten.size() entries
 */
void get_suffix(const std::string& patten, int suffix[]) {
    const int patten_len = static_cast<int>(patten.size());
    if (patten_len == 0) {
        // Nothing to fill in; without this guard suffix[-1] would be written.
        return;
    }
    suffix[patten_len - 1] = patten_len;
    // Naive reference version (compares from scratch for every i):
    // for (int i = patten_len - 2; i >= 0; --i) {
    //     int j = i;
    //     while (j >= 0 && patten[j] == patten[patten_len - 1 - (i - j)]) {
    //         --j;
    //     }
    //     suffix[i] = i - j;
    // }
    // Position where the most recent explicit comparison run stopped (mismatch or -1).
    int last_fail_pos = patten_len - 1;
    // Position where the most recent explicit comparison run started.
    int last_start_pos = patten_len - 1;
    for (int i = patten_len - 2; i >= 0; --i) {
        // Position near the end of the pattern that i was matched against
        // during the most recent comparison run.
        const int mirror = patten_len - 1 - (last_start_pos - i);
        if (last_fail_pos < i && suffix[mirror] < i - last_fail_pos) {
            // i lies inside the last matched window and the mirrored value
            // fits strictly inside it, so it can be reused without comparing.
            suffix[i] = suffix[mirror];
        } else {
            if (i < last_fail_pos) {
                last_fail_pos = i;
            }
            last_start_pos = i;
            // Compare explicitly, resuming from last_fail_pos.
            while (last_fail_pos >= 0 &&
                   patten[last_fail_pos] ==
                       patten[patten_len - 1 - (last_start_pos - last_fail_pos)]) {
                --last_fail_pos;
            }
            suffix[i] = last_start_pos - last_fail_pos;
        }
    }
}
/**
 * Builds the good-suffix shift table.
 * good_suffix[j] is the distance to shift the pattern when a mismatch occurs
 * at pattern position j. An empty pattern leaves the output untouched.
 * @param patten      the pattern string
 * @param good_suffix output table; must have room for patten.size() entries
 */
void get_good_suffix(const std::string& patten, int good_suffix[]) {
    const int patten_len = static_cast<int>(patten.size());
    if (patten_len == 0) {
        // No entries to compute, and get_suffix must not be handed a zero-length buffer.
        return;
    }
    // The vector owns the scratch table, so no manual delete[] is needed.
    std::vector<int> suffix(patten_len);
    get_suffix(patten, suffix.data());
    // Default shift: the whole pattern length (no reusable part found).
    for (int i = 0; i < patten_len; ++i) {
        good_suffix[i] = patten_len;
    }
    // Case 1: a prefix of the pattern equals a suffix of the pattern.
    // suffix[i] == i + 1 means patten[0..i] is such a prefix.
    int j = 0;
    for (int i = patten_len - 2; i >= 0; --i) {
        if (i + 1 == suffix[i]) {
            // Mismatch positions to the left of where that suffix starts can
            // shift so that the prefix lines up with the matched suffix.
            for (; j < patten_len - i - 1; ++j) {
                // Only fill entries that still hold the default, so the
                // smaller shift found earlier (larger i) is kept.
                if (good_suffix[j] == patten_len) {
                    good_suffix[j] = patten_len - i - 1;
                }
            }
        }
    }
    // Case 2: the matched suffix re-occurs inside the pattern, ending at i.
    // Increasing i means later writes carry smaller shifts and overwrite earlier ones.
    for (int i = 0; i < patten_len - 1; ++i) {
        good_suffix[patten_len - suffix[i] - 1] = patten_len - i - 1;
    }
}
/**
 * Boyer-Moore search: finds the first occurrence of patten in target.
 * @param target the text to search in
 * @param patten the pattern to search for
 * @return index of the first match in target, 0 for an empty pattern,
 *         or -1 when the pattern does not occur
 */
int bm(const std::string& target, const std::string& patten) {
    if (patten.empty()) {
        return 0;
    }
    const int target_len = static_cast<int>(target.size());
    const int patten_len = static_cast<int>(patten.size());
    int bad_char[MAX_CHAR];
    // The vector owns the table, so every return path releases it.
    std::vector<int> good_suffix(patten_len);
    get_bad_char(patten, bad_char);
    get_good_suffix(patten, good_suffix.data());
    int target_pos = 0;
    while (target_pos + patten_len <= target_len) {
        // Compare the current alignment from right to left.
        int j = patten_len - 1;
        while (j >= 0 && target[target_pos + j] == patten[j]) {
            --j;
        }
        if (j < 0) {
            return target_pos;
        }
        // Index through unsigned char so bytes >= 0x80 stay inside the table.
        const unsigned char mismatched =
            static_cast<unsigned char>(target[target_pos + j]);
        // bad_char[c] is measured from the end of the pattern, so subtract the
        // distance from j to the end of the pattern (not anything derived from
        // the text length) to get the shift that aligns the last occurrence of
        // c with the mismatched text position. The result may be <= 0; the
        // good-suffix shift is always >= 1, so the maximum makes progress.
        const int bad_char_shift = bad_char[mismatched] - (patten_len - 1 - j);
        target_pos += std::max(bad_char_shift, good_suffix[j]);
    }
    return -1;
}
// Demo driver: prints the match position reported by std::string::find and
// then the one reported by bm() for the same inputs, one per line.
int main() {
    const std::string target = "abbadcababacab";
    const std::string patten = "babac";
    // Alternative inputs for manual experiments:
    //string target="bcabcdababcabaabcbcabababacbacabeeacda";
    //string patten="bcababab";
    //string target = "dieiahgjkriabddioababa";
    //string patten = "eigha";
    //string target = "";
    //string patten = "";
    const std::string::size_type reference_pos = target.find(patten);
    std::cout << reference_pos << std::endl;
    const int bm_pos = bm(target, patten);
    std::cout << bm_pos << std::endl;
    return 0;
}
参考资料