介绍
BM(Boyer-Moore)算法在文本模式匹配方面的效率无需多言,它在 Linux 文件搜索中的应用也证明了其实用价值。这里有一篇介绍其原理的文章:《BM算法介绍》。
Show you the code
#include <algorithm>
#include <iostream>
#include "BM.h"
// Build-time switch: while FS_MATCH_NOCASE is defined, bm_tolower() folds the
// ASCII letters 'A'..'Z' to lower case; otherwise bm_tolower() does nothing.
#define FS_MATCH_NOCASE
#ifdef FS_MATCH_NOCASE
// Lower-cases `c` in place when it is an ASCII upper-case letter (adds 32).
// `c` must be a modifiable lvalue and is evaluated more than once, so pass a
// plain variable. The do/while(0) wrapper makes the macro a single statement,
// so `if (x) bm_tolower(c); else ...` parses as intended.
#define bm_tolower(c) \
    do { \
        if ((c) >= BDS_TEXT('A') && (c) <= BDS_TEXT('Z')) { (c) += 32; } \
    } while (0)
#else
#define bm_tolower(c) do { } while (0)
#endif
namespace BaiduService
{
namespace FileSearch
{
/// Constructs a matcher with no pattern installed.
///
/// Allocates the bad-character table: 1 << 16 one-byte entries, indexed
/// elsewhere in this file by a character value cast to USHORT. Every byte is
/// initialised to 0xFF (the memset value -1).
///
/// If the allocation fails the table pointer is left NULL instead of being
/// passed to memset; the destructor already tolerates a NULL table.
CMBMMatch::CBMMatch::CBMMatch() :
    m_pattern(NULL),
    m_arraySuffixTable(NULL),
    m_nPatternLen(0)
{
    const size_t kBadCharTableBytes = 1 << 16;
    m_pArrayBadCharTable = (char *)malloc(kBadCharTableBytes);
    if (m_pArrayBadCharTable != NULL)
    {
        memset(m_pArrayBadCharTable, -1, kBadCharTableBytes);
    }
}
/// Releases the three heap buffers owned by the matcher: the bad-character
/// table, the stored pattern copy and the good-suffix table.
CMBMMatch::CBMMatch::~CBMMatch()
{
    // free() is a no-op on NULL, so buffers that were never allocated need no
    // explicit guard.
    free(m_pArrayBadCharTable);
    free(m_pattern);
    free(m_arraySuffixTable);
}
bool CMBMMatch::CBMMatch::SetPattern(const BDS_TCHAR *pattern)
{
// bad char table
m_nPatternLen = bdststrlen(pattern);
if (m_nPatternLen < 1)
{
return false;
}
if (m_pattern != NULL)
{
free(m_pattern);
}
m_pattern = (BDS_TCHAR *)malloc((m_nPatternLen + 1) * sizeof(BDS_TCHAR));
memset(m_pattern, 0, (m_nPatternLen + 1) * sizeof(BDS_TCHAR));
#ifndef FS_MATCH_NOCASE
bdsstrcpy(m_pattern, pattern);
#else //
for (size_t i = 0; i < m_nPatternLen; i++)
{
BDS_TCHAR c = pattern[i];
bm_tolower( c );
m_pattern[i] = c;
}
#endif
for (size_t i = 0; i < m_nPatternLen; i++)
{
m_pArrayBadCharTable[(USHORT)m_pattern[i]] = i;
}
// build good suffix table, from right to left
m_arraySuffixTable = (char *)malloc(m_nPatternLen);
char cLastGoodSuffix = 1;
for (size_t i = m_nPatternLen - 1; i > 0; i--)
{
// compare the prefix
size_t j = 0;
for (; j < m_nPatternLen - i; j++)
{
if ( m_pattern[j] != m_pattern[j + i] )
{
break;
}
}
if (j == m_nPatternLen - i)
{
m_arraySuffixTable[i] = i;
cLastGoodSuffix = i;
}
else
{
m_arraySuffixTable[i] = cLastGoodSuffix;
}
}
// ajust the last subffix
m_arraySuffixTable[0] = cLastGoodSuffix;
m_arraySuffixTable[m_nPatternLen - 1] = 1;
// find the good suffix
for (size_t i = m_nPatternLen - 1; i > 0; i--)
{
int nLocation = 0;
nLocation = FindSubString( &m_pattern[i], &m_pattern[0] );
if (nLocation != 0)
{
m_arraySuffixTable[i-1] = i - nLocation;
}
}
return true;
}
bool CMBMMatch::CBMMatch::Match(const BDS_TCHAR *text)
{
// do search
bool bFound = false;
size_t nTextLen = bdststrlen(text);
if (nTextLen < m_nPatternLen)
{
return false;
}
size_t i = 0;
for ( ; i < nTextLen;)
{
int j = m_nPatternLen - 1;
for ( ; j >= 0; j--)
{
if (i + j > nTextLen - 1)
{
return false;
}
BDS_TCHAR c = text[i + j];
bm_tolower(c);
if (m_pattern[j] != c )
{
// find the next jump
if ((j - m_pArrayBadCharTable[(USHORT)c]) > m_arraySuffixTable[j])
{
i += (j - m_pArrayBadCharTable[(USHORT)c]);
}
else
{
i += m_arraySuffixTable[j];
}
break;
}
}
if (j == -1)
{
bFound = true;
break;
}
}
return bFound;
}
/// Scans `szText` for occurrences of `szPattern` and returns the start offset
/// of the occurrence that immediately precedes the last one.
///
/// @param szPattern  NUL-terminated string to look for.
/// @param szText     NUL-terminated string to search in.
/// @return the offset of the second-to-last occurrence, or 0 when fewer than
///         two occurrences exist. A return of 0 is therefore ambiguous: it is
///         also what a second-to-last occurrence at offset 0 yields.
int CMBMMatch::CBMMatch::FindSubString(const BDS_TCHAR *szPattern, const BDS_TCHAR *szText)
{
    const size_t cchText = bdststrlen( szText );
    const size_t cchPattern = bdststrlen( szPattern );

    int nNewest = 0;    // offset of the most recent occurrence seen so far
    int nPrevious = 0;  // offset of the occurrence seen before that one

    for (size_t nStart = 0; nStart < cchText; ++nStart)
    {
        // Count how many leading pattern characters match at this offset. The
        // text's terminating NUL stops the comparison before it can run past
        // the end of szText.
        size_t nMatched = 0;
        while (nMatched < cchPattern && szPattern[nMatched] == szText[nStart + nMatched])
        {
            ++nMatched;
        }
        if (nMatched != cchPattern)
        {
            continue;
        }
        nPrevious = nNewest;
        nNewest = nStart;
    }
    return nPrevious;
}
/// Creates a multi-pattern matcher; the body does no work of its own.
CMBMMatch::CMBMMatch() = default;
/// Destroys the multi-pattern matcher, deleting every matcher it still holds
/// by delegating to Reset().
CMBMMatch::~CMBMMatch()
{
    // Reset() always reports true; its result carries no information here.
    (void)Reset();
}
bool CMBMMatch::Reset()
{
std::for_each(m_vecMatch.begin(), m_vecMatch.end(), [&](CBMMatch *match) {
delete match;
});
m_vecMatch.clear();
m_vecPatterns.clear();
return true;
}
bool CMBMMatch::SetPatterns(std::vector<bdststring> vecPatterns)
{
Reset();
m_vecPatterns = vecPatterns;
std::for_each(m_vecPatterns.begin(), m_vecPatterns.end(), [&](bdststring pattern) {
CBMMatch *match = new CBMMatch();
if (match->SetPattern(pattern.c_str()))
{
m_vecMatch.push_back(match);
}
else
{
delete match;
}
});
return true;
}
bool CMBMMatch::Match(const BDS_TCHAR *text)
{
bool bFound = true;
//bFound = m_vecMatch[0]->Match(text);
std::all_of(m_vecMatch.begin(), m_vecMatch.end(), [&](CBMMatch *match) -> bool {
if (!match->Match(text))
{
bFound = false;
return false;
}
return true;
});
return bFound;
}
}
}
说明:以上代码没有经过优化,也没有经过详细 review,写好之后就没有再看过,难免存在问题,读者可自行处理。