字符串查找算法python_提取Python stringlib中的"BMHBNFS"字符串查找算法 - baiguomeng - ITeye博客...

最新推荐文章于 2022-03-31 17:43:01 发布

司幽幽

最新推荐文章于 2022-03-31 17:43:01 发布

阅读量126

点赞数

文章标签：字符串查找算法python

本文链接：https://blog.csdn.net/weixin_42119989/article/details/113715225

版权

Python 中 stringlib 的字符串查找算法是 Boyer-Moore、Horspool、Sunday 和 Bloom Filter 几种算法的结合体，大致原理如下：

def find(s, p):
    """Return the index of the first occurrence of p in s, or -1 if absent.

    Illustrative mix of Boyer-Moore (compare the last pattern character
    first), Horspool (shift by the delta1 entry of the last pattern
    character) and Sunday (jump past s[i+m] when that character does not
    occur in the pattern).  An empty pattern is reported at index 0, the
    same answer str.find gives.
    """
    n = len(s)
    m = len(p)
    if m == 0:
        return 0
    last = p[m - 1]
    # delta1(p)[p[m-1]]: distance from the right-most earlier occurrence of
    # the last pattern character to the end of the pattern, or m when the
    # character appears nowhere else in p.
    skip = m
    for k in range(m - 1):
        if p[k] == last:
            skip = m - 1 - k
    i = 0
    while i <= n - m:
        # s[i+m] only exists while the window does not touch the end of s;
        # once it does, any shift ends the loop, so the lookahead is skipped.
        has_next = i + m < n
        if s[i + m - 1] == last:  # (boyer-moore)
            # potential match
            if s[i:i + m - 1] == p[:m - 1]:
                return i
            if has_next and s[i + m] not in p:
                i = i + m + 1  # (sunday)
            else:
                i = i + skip  # (horspool)
        else:
            # skip
            if has_next and s[i + m] not in p:
                i = i + m + 1  # (sunday)
            else:
                i = i + 1
    return -1  # not found

以下是具体实现:

/* stringlib: fastsearch implementation */

#ifndef STRINGLIB_FASTSEARCH_H
#define STRINGLIB_FASTSEARCH_H

#include <sys/types.h> /* ssize_t */

/* fast search/count implementation, based on a mix between boyer-
   moore and horspool, with a few more bells and whistles on the top.
   for some more background, see: http://effbot.org/zone/stringlib.htm */

/* note: the count mode returns -1 if there cannot possibly be a match
   in the target string, and 0 if it has actually checked for matches,
   but didn't find any. callers beware!

   the lookahead (s[i + m]) and lookbehind (s[i - 1]) probes are only
   performed while they stay inside s[0 .. n-1], so the buffer does not
   need a terminator or any padding around it.

   NOTE(review): fastsearch() has external linkage although it lives in
   a header; include this file from a single translation unit. */

/* search modes accepted by fastsearch() */
#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2

/* LONG_BIT is only used to pick the bloom mask width; fall back to the
   smallest supported width when the platform headers do not define it. */
#ifndef LONG_BIT
#define LONG_BIT 32
#endif

#if LONG_BIT >= 128
#define STRINGLIB_BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define STRINGLIB_BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define STRINGLIB_BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* set the mask bit selected by the low bits of ch */
#define STRINGLIB_BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH - 1))))

/* non-zero if the mask bit selected by the low bits of ch is set; a zero
   result proves ch was never added, a non-zero one may be a collision */
#define STRINGLIB_BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH - 1))))

/*
 * Search for, or count, occurrences of p[0 .. m-1] inside s[0 .. n-1].
 *
 *   s, n      buffer to search and its length
 *   p, m      pattern and its length
 *   maxcount  FAST_COUNT only: stop and return maxcount once that many
 *             matches were seen; a negative value never equals the
 *             running count, so it acts as "no limit"
 *   mode      FAST_COUNT, FAST_SEARCH or FAST_RSEARCH
 *
 * Returns
 *   FAST_SEARCH   index of the first match, -1 if there is none
 *   FAST_RSEARCH  index of the last match, -1 if there is none
 *   FAST_COUNT    number of non-overlapping matches (capped at maxcount);
 *                 0 if the buffer was scanned without a hit
 *   -1 in every mode when m <= 0 or m > n, and in FAST_COUNT mode when
 *   maxcount == 0.
 */
ssize_t fastsearch(const char *s, ssize_t n,
                   const char *p, ssize_t m,
                   ssize_t maxcount, int mode)
{
    unsigned long mask;
    ssize_t skip, count = 0;
    ssize_t i, j, mlast, w;

    /* w is the last index at which the pattern still fits */
    w = n - m;

    if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) {
        return -1;
    }

    /* look for special cases */
    if (m <= 1) {
        if (m <= 0) {
            return -1;
        }
        /* use special case for 1-character strings */
        if (mode == FAST_COUNT) {
            for (i = 0; i < n; i++) {
                if (s[i] == p[0]) {
                    count++;
                    if (count == maxcount) {
                        return maxcount;
                    }
                }
            }
            return count;
        }
        else if (mode == FAST_SEARCH) {
            for (i = 0; i < n; i++) {
                if (s[i] == p[0]) {
                    return i;
                }
            }
        }
        else { /* FAST_RSEARCH */
            for (i = n - 1; i > -1; i--) {
                if (s[i] == p[0]) {
                    return i;
                }
            }
        }
        return -1;
    }

    mlast = m - 1;
    /* the loop increment/decrement supplies the final +1/-1 of each shift */
    skip = mlast - 1;
    mask = 0;

    if (mode != FAST_RSEARCH) {

        /* create compressed boyer-moore delta 1 table */

        /* process pattern[:-1] */
        for (i = 0; i < mlast; i++) {
            STRINGLIB_BLOOM_ADD(mask, p[i]);
            if (p[i] == p[mlast]) {
                skip = mlast - i - 1;
            }
        }
        /* process pattern[-1] outside the loop */
        STRINGLIB_BLOOM_ADD(mask, p[mlast]);

        for (i = 0; i <= w; i++) {
            /* note: using mlast in the skip path slows things down on x86 */
            if (s[i + m - 1] == p[m - 1]) {
                /* candidate match */
                for (j = 0; j < mlast; j++) {
                    if (s[i + j] != p[j]) {
                        break;
                    }
                }
                if (j == mlast) {
                    /* got a match! */
                    if (mode != FAST_COUNT) {
                        return i;
                    }
                    count++;
                    if (count == maxcount) {
                        return maxcount;
                    }
                    /* resume right after this match (non-overlapping) */
                    i = i + mlast;
                    continue;
                }
                /* miss: check if next character is part of pattern.
                   at i == w there is no next character inside the buffer
                   and every shift ends the loop, so skip the probe */
                if (i < w && !STRINGLIB_BLOOM(mask, s[i + m])) {
                    i = i + m;
                }
                else {
                    i = i + skip;
                }
            }
            else {
                /* skip: check if next character is part of pattern */
                if (i < w && !STRINGLIB_BLOOM(mask, s[i + m])) {
                    i = i + m;
                }
            }
        }
    }
    else { /* FAST_RSEARCH */

        /* create compressed boyer-moore delta 1 table */

        /* process pattern[0] outside the loop */
        STRINGLIB_BLOOM_ADD(mask, p[0]);
        /* process pattern[:0:-1] */
        for (i = mlast; i > 0; i--) {
            STRINGLIB_BLOOM_ADD(mask, p[i]);
            if (p[i] == p[0]) {
                skip = i - 1;
            }
        }

        for (i = w; i >= 0; i--) {
            if (s[i] == p[0]) {
                /* candidate match */
                for (j = mlast; j > 0; j--) {
                    if (s[i + j] != p[j]) {
                        break;
                    }
                }
                if (j == 0) {
                    /* got a match! */
                    return i;
                }
                /* miss: check if previous character is part of pattern.
                   at i == 0 there is no previous character and every
                   shift ends the loop, so skip the probe */
                if (i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])) {
                    i = i - m;
                }
                else {
                    i = i - skip;
                }
            }
            else {
                /* skip: check if previous character is part of pattern */
                if (i > 0 && !STRINGLIB_BLOOM(mask, s[i - 1])) {
                    i = i - m;
                }
            }
        }
    }

    if (mode != FAST_COUNT) {
        return -1;
    }
    return count;
}

#endif

测试代码

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/*
 * Small driver for fastsearch(): times one FAST_SEARCH call for "Cache:"
 * in a sample HTTP request, prints the match offset and the text from
 * that offset on, then prints the FAST_COUNT result for the same pattern.
 *
 * NOTE(review): fastsearch(), the FAST_* constants and get_cycle_count()
 * are expected to be declared elsewhere; get_cycle_count() is presumably
 * a platform cycle-counter helper -- confirm its return type.
 */
int main(int argc, char **argv)
{
    const char *str = "GET / HTTP 1.0\r\nHost: www.xxx.com\r\nCache: \r\nCache:\r\n Length:\r\n";
    ssize_t len = (ssize_t)strlen(str);
    ssize_t rc = 0;
    uint64_t start, end;

    (void)argc;
    (void)argv;

    start = get_cycle_count();
    rc = fastsearch(str, len, "Cache:", 6, 2, FAST_SEARCH);
    end = get_cycle_count();

    /* rc is signed (-1 means "not found"), so print it with %zd */
    printf("fastsearch return %zd cost %llu \n", rc,
           (unsigned long long)(end - start));

    /* only index into the string when a match was actually found */
    if (rc >= 0) {
        printf("result = %s\n", str + rc);
    }

    /* maxcount of -1 never equals the running count: count every match */
    rc = fastsearch(str, len, "Cache:", 6, -1, FAST_COUNT);
    printf("result = %zd\n", rc);

    return 0;
}

从 stringlib 的测试数据来看，这个算法的表现还是相当不错的。

不过我在 Tile 平台上实测时发现，它的速度还不如 snort 中的 BMH 算法快。

当然这只是一次单一的测试，没有考虑 cache 的影响，结果仅供参考。

原文参考:

分享到：

2012-01-11 14:50

司幽幽

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
字符串查找算法python_提取Python stringlib中的"BMHBNFS"字符串查找算法 - baiguomeng - ITeye博客...

Python中的stringlib字符串查找算法是Boyer-Moore,Horspool, Sunday, Bloom Filter几种算法的合成体, 大概的原理如下:def find(s, p):# find first occurrence of p in sn = len(s)m = len(p)skip = delta1(p)[p[m-1]]i = 0while i <= n-m:...
复制链接

扫一扫