Python中的stringlib字符串查找算法是Boyer-Moore,Horspool, Sunday, Bloom Filter几种算法的合成体, 大概的原理如下:
def find(s, p):
# find first occurrence of p in s
n = len(s)
m = len(p)
skip = delta1(p)[p[m-1]]
i = 0
while i <= n-m:
if s[i+m-1] == p[m-1]: # (boyer-moore)
# potential match
if s[i:i+m-1] == p[:m-1]:
return i
if s[i+m] not in p:
i = i + m + 1 # (sunday)
else:
i = i + skip # (horspool)
else:
# skip
if s[i+m] not in p:
i = i + m + 1 # (sunday)
else:
i = i + 1
return -1 # not found
以下是具体实现:
/* stringlib: fastsearch implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#define STRINGLIB_FASTSEARCH_H
#include
#include
#include
#include
#include
/* fast search/count implementation, based on a mix between boyer-
moore and horspool, with a few more bells and whistles on the top.
for some more background, see: http://effbot.org/zone/stringlib.htm */
/* note: fastsearch may access s[n], which isn't a problem when using
Python's ordinary string types, but may cause problems if you're
using this code in other contexts. also, the count mode returns -1
if there cannot possible be a match in the target string, and 0 if
it has actually checked for matches, but didn't find any. callers
beware! */
#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2
#ifndef LONG_BIT
#define LONG_BIT 32
#endif
#if LONG_BIT >= 128
#define STRINGLIB_BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define STRINGLIB_BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define STRINGLIB_BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
#define STRINGLIB_BLOOM_ADD(mask, ch) \
((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
#define STRINGLIB_BLOOM(mask, ch) \
((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
ssize_t fastsearch(const char *s, ssize_t n,
const char *p, ssize_t m,
ssize_t maxcount, int mode)
{
unsigned long mask;
ssize_t skip, count = 0;
ssize_t i, j, mlast, w;
w = n - m;
if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) {
return -1;
}
/* look for special cases */
if (m <= 1) {
if (m <= 0) {
return -1;
}
/* use special case for 1-character strings */
if (mode == FAST_COUNT) {
for (i = 0; i < n; i++)
if (s[i] == p[0]) {
count++;
if (count == maxcount) {
return maxcount;
}
}
return count;
}
else if (mode == FAST_SEARCH) {
for (i = 0; i < n; i++)
if (s[i] == p[0]) {
return i;
}
}
else { /* FAST_RSEARCH */
for (i = n - 1; i > -1; i--)
if (s[i] == p[0]) {
return i;
}
}
return -1;
}
mlast = m - 1;
skip = mlast - 1;
mask = 0;
if (mode != FAST_RSEARCH) {
/* create compressed boyer-moore delta 1 table */
/* process pattern[:-1] */
for (i = 0; i < mlast; i++) {
STRINGLIB_BLOOM_ADD(mask, p[i]);
if (p[i] == p[mlast]) {
skip = mlast - i - 1;
}
}
/* process pattern[-1] outside the loop */
STRINGLIB_BLOOM_ADD(mask, p[mlast]);
for (i = 0; i <= w; i++) {
/* note: using mlast in the skip path slows things down on x86 */
if (s[i + m - 1] == p[m - 1]) {
/* candidate match */
for (j = 0; j < mlast; j++)
if (s[i + j] != p[j]) {
break;
}
if (j == mlast) {
/* got a match! */
if (mode != FAST_COUNT) {
return i;
}
count++;
if (count == maxcount) {
return maxcount;
}
i = i + mlast;
continue;
}
/* miss: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, s[i + m])) {
i = i + m;
}
else {
i = i + skip;
}
}
else {
/* skip: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, s[i + m])) {
i = i + m;
}
}
}
}
else { /* FAST_RSEARCH */
/* create compressed boyer-moore delta 1 table */
/* process pattern[0] outside the loop */
STRINGLIB_BLOOM_ADD(mask, p[0]);
/* process pattern[:0:-1] */
for (i = mlast; i > 0; i--) {
STRINGLIB_BLOOM_ADD(mask, p[i]);
if (p[i] == p[0]) {
skip = i - 1;
}
}
for (i = w; i >= 0; i--) {
if (s[i] == p[0]) {
/* candidate match */
for (j = mlast; j > 0; j--)
if (s[i + j] != p[j]) {
break;
}
if (j == 0)
/* got a match! */
{
return i;
}
/* miss: check if previous character is part of pattern */
if (!STRINGLIB_BLOOM(mask, s[i - 1])) {
i = i - m;
}
else {
i = i - skip;
}
}
else {
/* skip: check if previous character is part of pattern */
if (!STRINGLIB_BLOOM(mask, s[i - 1])) {
i = i - m;
}
}
}
}
if (mode != FAST_COUNT) {
return -1;
}
return count;
}
#endif
测试代码
#include
int main(int argc, char **argv)
{
char *str = "GET / HTTP 1.0\r\nHost: www.xxx.com\r\nCache: \r\nCache:\r\n Length:\r\n";
ssize_t rc = 0;
uint64_t start, end;
start = get_cycle_count();
rc = fastsearch(str, strlen(str), "Cache:", 6, 2, FAST_SEARCH);
end = get_cycle_count();
printf("fastsearch return %u cost %llu \n", rc, end - start);
printf("result = %s\n", str + rc);
rc = fastsearch(str, strlen(str), "Cache:", 6, -1, FAST_COUNT);
printf("result = %u\n", rc);
return 0;
}
看stringlib测试数据, 还是蛮可以的.
我在tile平台上测试发现还没有snort中的BMH算法速度快.
不过这个只是单一测试, 没有考虑到cache的情况, 仅供参考.
原文参考:
分享到:
2012-01-11 14:50
浏览 1047
评论