关键字查找

最新推荐文章于 2021-01-27 14:39:07 发布

W_SX12553

最新推荐文章于 2021-01-27 14:39:07 发布

阅读量3.7k

点赞数 2

分类专栏：算法文章标签：敏感词过滤关键词查找

本文链接：https://blog.csdn.net/W_SX12553/article/details/78652736

版权

算法专栏收录该内容

1 篇文章 0 订阅

订阅专栏

中文关键字查找(敏感词过滤)

背景

近来需要实现一个能在极短时间内判断某段文字中是否出现关键字(敏感词)的应用。网上找了一些资料：有用 Java 写的双数组 AC 树，速度达到每秒 27MB；用 C 写的 AC 树每秒也只有 30MB。

性能

多叉树 c 语言写的，
机器：Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
查找速率约： 111000 (bytes/ms)

实现原理(方法)

1.对于多匹配算法的查找，一般认为类似于 ac 算法的思想比较好，因为对于主串不用回溯比较。前提是对于类似于字典的关键词比较多时(即如 abc, abcdef, abcdff 这样前缀相同比较多的情况) 效率才会有明显提升。

但中文敏感词的前缀重复率太低，使用 AC 自动机时，大部分情况下失配后的跳转与回溯算法无异，反而增加了比较次数(其实比较本身并不耗时，真正耗时的是访问不同内存区域导致的多次 cache miss)，性能反而下降了。

2.这里采用的方法: 直接用多叉树，第一级用 24 位(即分配 2^24 个指针的空间，每个指针 8 字节，共 2^27 字节 = 128MB 内存)，第二级用 8 位，第三级直接保存除前两级之外剩下的字节。

注意：这里采用了 bitmap。比如同时添加了关键词“我们的”和“我”，其中“我”只占 3 个字节(只用第一级索引就能表示)，因此用 bitmap 中的一个标记位来记录它。这样既节省内存，又有两个作用：一是查找时匹配到此处即可认定已经找到(即主串中出现了“我”)；二是用户之后想删除关键词“我”时，只需清除 bitmap 中对应的标记位，不会影响“我们的”这个关键词的查找。

不足

这里采用了内存池，但速度也只能提升 10% 左右。内存池写得很粗糙，仅作演示。如果想重写或不想用内存池，把 FindKeyword.c 文件中对 mpool_alloc 函数的调用改为 malloc 即可。

源代码

如果你想直接运行下看效果，可以下载资源 git clone https://gitee.com/ben_wsx/FindKeyword.git 。
(http://download.csdn.net/download/w_sx12553/10135616 这里的资源是旧的，恶心的CSDN却不能删除)或者就复制下面的文件

主要代码如下：

FindKeyword.c

//
// 功能：      测试一下树，2^24叉树作为第一级,进行敏感词过滤
//
//
//                                          ben
//                                          2017.11.23
//
//

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdarg.h>
#include <memory.h>
#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>  
#include <sys/types.h>  
#include <sys/stat.h>  
#include <dirent.h> 

#include "getTime.h"
#include "mpool.h"


#define _TEST_  // build in test mode: findKeyword() also records what it matched

#ifdef _TEST_
// Start of the most recent match inside the searched text (written by findKeyword()).
char* g_findKeyword = NULL; // the keyword that was found
// Length in bytes of that match.
int   g_findKeyword_Len = 0;
#endif


// NOTE(review): these two helpers are not referenced anywhere in the visible source.
#define GetFirstIndex(i) (((i)>>8) &(0xffffff))
#define GetSecondIndex(i) ((i)&0xff)

// NOTE(review): not referenced in the visible source; the pool actually used lives in struct TreeInfo.
mpool* g_mp = NULL;

// Fan-out of a second-level node (one slot per possible byte value).
#define _256_BRANCH 256
// Fan-out of the root node (one slot per possible 24-bit first-level index).
#define BigBranchCnts (1<<24)

#define _FOUND_TAG_     (1)     // marker stored in findTag[]: a keyword ends at this slot

#define  FIRST_LEFTBYTESSET_CNTS    (16 - 1)        // number of entries in the first allocation of a LeftBytesSet


// Size in bytes of one stored keyword-tail slot (including the terminating NUL).
#define KEYWORD_MAX_LEN 64
// Third-level storage: the bytes of each keyword that remain after the
// first two index levels (i.e. everything from byte 4 onwards).
// getLeftBytesSet() over-allocates this struct so that keySet really holds
// totalCnts rows, not one.
struct LeftBytesSet {
    uint32_t Cnts;      // number of rows currently in use
    uint32_t totalCnts; // number of rows allocated
    char keySet[1][KEYWORD_MAX_LEN];                // NOTE: variable-length trailing array, must stay the last field of the struct
};


/*
 * Allocate an empty LeftBytesSet with room for `size` keyword tails.
 *
 * mp   - memory pool the set is carved from
 * size - number of KEYWORD_MAX_LEN-byte rows to reserve (must be >= 1)
 *
 * Returns the new set (Cnts == 0, totalCnts == size), or NULL when `size`
 * is invalid, the byte count does not fit the pool's `int` size parameter,
 * or the pool cannot satisfy the request.
 */
struct LeftBytesSet*
getLeftBytesSet(mpool* mp, int size)
{
    /* Largest value representable by the pool's `int` size argument
       (computed without <limits.h>, which this file does not include). */
    const size_t maxBytes = (size_t)(((unsigned int)-1) >> 1);
    struct LeftBytesSet* pLeftBytesSet;
    size_t nBytes;

    if (size < 1)
        return NULL;

    /* The struct already contains one row, so only size-1 extra rows are added. */
    if ((size_t)(size - 1) > (maxBytes - sizeof(struct LeftBytesSet)) / KEYWORD_MAX_LEN)
        return NULL;
    nBytes = sizeof(struct LeftBytesSet) + (size_t)KEYWORD_MAX_LEN * (size_t)(size - 1);

    pLeftBytesSet = (struct LeftBytesSet*)mpool_alloc(mp, (int)nBytes);
    if (NULL == pLeftBytesSet)
        return NULL;

    pLeftBytesSet->totalCnts = size;
    pLeftBytesSet->Cnts = 0;

    return pLeftBytesSet;
}


// Second-level node, indexed by the 4th byte of a keyword (keyword[3] & 0xff).
struct _2_Node {
    struct LeftBytesSet* child[_256_BRANCH];   // tails of keywords longer than 4 bytes; NULL when none
    int findTag[_256_BRANCH];                  // set to _FOUND_TAG_ when a keyword of exactly 4 bytes ends here
};


// First-level (root) node: one slot per 24-bit index built from the first
// three bytes of a keyword. A slot is NULL until a keyword longer than
// three bytes with that prefix is inserted.
struct _1_Node {
    struct _2_Node* child[BigBranchCnts];
};


// Everything that makes up one keyword tree.
struct TreeInfo {
    mpool* mPool;                       // pool that all tree nodes are allocated from
    struct _1_Node* root;   // root of the tree
    char bitmapMemBlock[2*1024*1024];   // 2^24 bits, one per first-level index; a set bit marks a keyword of at most 3 bytes
};


//
// Copy exactly `len` bytes from `src` to `dst` and NUL-terminate the result.
// `dst` must have room for len + 1 bytes; bytes are copied as-is, so a NUL
// inside `src` does not stop the copy.
//
void myStrcpy_s(char* dst, char* src, int len)
{
    int copied = 0;

    while (copied < len) {
        dst[copied] = src[copied];
        ++copied;
    }

    dst[len] = '\0';
}

// Length of a keyword line: the number of bytes before the first NUL,
// carriage return or line feed, whichever comes first.
int myStrlen(char* str)
{
    char* cursor = str;

    for (; *cursor != '\0'; ++cursor) {
        if (*cursor == '\r' || *cursor == '\n')
            break;
    }

    return (int)(cursor - str);
}


//
// Classify a filesystem path.
//
// Returns: 0  - the path is NULL, does not exist, or cannot be stat()ed
//          1  - the path is a directory
//          -1 - the path exists and is not a directory (treated as a file)
//
int isFileOrDir(char* path)
{
    struct stat info;   // on the stack: no allocation that could fail or leak

    if (NULL == path || 0 != access(path, F_OK))
        return 0;

    // The entry can vanish between access() and stat(); report "missing"
    // instead of classifying an unfilled struct.
    if (0 != stat(path, &info))
        return 0;

    return S_ISDIR(info.st_mode) ? 1 : -1;
}


//
// Prefix comparison: compares only as many bytes as `s1` is long (this is
// NOT strcmp). Returns 0 when `s2` starts with `s1`, otherwise 1 or -1
// according to the first differing byte, compared as plain `char`.
//
inline int __attribute__((always_inline))
myStrcmp(char* s1, char* s2)
{
    for (; *s1 != '\0'; ++s1, ++s2) {
        if (*s1 != *s2)
            return (*s1 > *s2) ? 1 : -1;
    }

    return 0;
}


//
// Byte length of one UTF-8 character (Chinese character or letter),
// derived from its first byte only.
//
// Returns 3 for a 1110xxxx lead byte, 1 for an ASCII byte (including 0),
// otherwise the number of leading 1 bits of the byte (so a stray
// continuation byte 10xxxxxx yields 1).
//
// The byte is handled as unsigned char so the result does not depend on
// whether plain `char` is signed and no negative value is ever shifted.
//
// Background on the UTF-8 encoding:
// http://note.youdao.com/noteshare?id=85958bf43d761398962af780b12392d6&sub=F79830034DF14F138B69146D19EB315B
//
inline int32_t __attribute__((always_inline))
get_utf_8_word_length(char tmp)
{
    unsigned char lead = (unsigned char)tmp;
    int len;

    // Test the 3-byte form first: most Chinese characters take 3 bytes.
    if ((lead & 0xf0) == 0xe0)
        return 3;

    // High bit clear: plain ASCII.
    if (lead < 0x80)
        return 1;

    // Count the leading 1 bits.
    len = 0;
    do {
        lead = (unsigned char)(lead << 1);
        len++;
    } while (lead & 0x80);

    return len;
}


//  ---------------------------------
//  BITAMP 

// Set bit number `pos` (bit 0 = least significant bit of byte 0).
inline void __attribute__((always_inline))
bitmap_set(char* bitmapMemBlock, int pos)
{
    const int byteIdx = pos >> 3;
    const int mask = 1 << (pos & 7);

    bitmapMemBlock[byteIdx] |= mask;
}

// Clear bit number `pos` (bit 0 = least significant bit of byte 0).
inline void __attribute__((always_inline))
bitmap_unset(char* bitmapMemBlock, int pos)
{
    const int byteIdx = pos >> 3;
    const int keepMask = ~(1 << (pos & 7));

    bitmapMemBlock[byteIdx] &= keepMask;
}


// Report whether bit number `pos` is set.
inline bool __attribute__((always_inline))
bitmap_get(char* bitmapMemBlock, int pos)
{
    const int mask = 1 << (pos & 7);
    const int bits = bitmapMemBlock[pos >> 3];

    return (bits & mask) != 0;
}
// -------------------------------------------


//
// 功能：插入关键字
//
inline void __attribute__((always_inline))
insertKeyword(struct TreeInfo* pTreeInfo, char* keyword, int keywordLen)
{
    const int leftLen = 4;                  // 记录第一级和第二级索引已经占用的字节数

    mpool* mp;
    struct _1_Node* root;
    struct _2_Node* p_2Node;
    struct LeftBytesSet* pLeftbytesSet, *pLeftbytesSet_2;

    uint32_t index;
    int Cnts;
    int i, j;

    root = pTreeInfo->root;
    mp = pTreeInfo->mPool;

    // 取第一个汉字字节(若超过3字节，就用3字节) 作为 第一级的索引
    if (keywordLen >= 3) {
        index = (*((uint32_t *)keyword)) & 0xffffff;
        }
    else if (2 == keywordLen) {
        //index = keyword[0] | (keyword[1]<<8);
        index = (*((uint32_t *)keyword)) & 0xffff;
        }
    else if (1 == keywordLen) {
        index = keyword[0]&0xff;
        }
    else
        return;

    if ((keywordLen - 3) <= 0) {
        bitmap_set(pTreeInfo->bitmapMemBlock, index);
        return;
        }

    p_2Node = root->child[index];
    if (NULL == p_2Node) {
        p_2Node =  root->child[index] = (struct _2_Node*)mpool_alloc(mp, sizeof(struct _2_Node));
        }

    // 得到第二级的索引
    index = keyword[3]&0xff;
    pLeftbytesSet = p_2Node->child[index];


    if ((keywordLen - 4) <= 0) {
        p_2Node->findTag[index] = _FOUND_TAG_;
        return;     
        }

    if (NULL == pLeftbytesSet) {
        pLeftbytesSet =  p_2Node->child[index] = 
            getLeftBytesSet(mp, FIRST_LEFTBYTESSET_CNTS);       
        }
    else if(0 == pLeftbytesSet->totalCnts - pLeftbytesSet->Cnts) {
        // 现在改用空间不足时，就成倍增长(类似于 c++ Vector 容器的做法)
        pLeftbytesSet_2 = 
            getLeftBytesSet(mp, pLeftbytesSet->totalCnts*2+1);

        // 拷贝原来的字符串到新分配的空间上
        for (i = 0; i < pLeftbytesSet->totalCnts; ++i) {
            strcpy(pLeftbytesSet_2->keySet[i], pLeftbytesSet->keySet[i]);
            pLeftbytesSet_2->Cnts ++;   
            }

        pLeftbytesSet = p_2Node->child[index] = pLeftbytesSet_2;
        //printf("扩大空间： %d \n", pLeftbytesSet->totalCnts);      
        }


    // 拷贝剩下的字符串到最后一级进行存储, 并进行从小到大的排序(方便查找时用二分法)
    //
    int beyondResult;   // 比较的结果值   
    Cnts = pLeftbytesSet->Cnts;
    for (i = 0; i < Cnts; ++i) {
        beyondResult = myStrcmp(keyword+leftLen, pLeftbytesSet->keySet[i]);

        if (0 > beyondResult) {
            break;
            }
        }

    for (j = Cnts; j > i; j--) {
        strcpy( pLeftbytesSet->keySet[j],  pLeftbytesSet->keySet[j-1]);
        }
    myStrcpy_s(pLeftbytesSet->keySet[i], keyword+leftLen, keywordLen-leftLen);

    pLeftbytesSet->Cnts++;
//  printf("cpystr= %s,    pLeftbytesSet->Cnts = %d\n ", keyword+leftLen, pLeftbytesSet->Cnts);
}


//
// 功能：删除关键字
// 
// 备注: 需要考虑到先是同时添加了"我们的"和"我"两个关键字，后来用户又删除了"我"这个关键字的情况。
//
//
void deleteKeyword(struct TreeInfo* pTreeInfo, char* keyword, int keywordLen)
{
    int cnts;
    int high, low, midle;           // 二分法查找
    struct _2_Node* p_2Node;
    struct LeftBytesSet* pLeftbytesSet;
    uint32_t index;

    if (keywordLen >= 3) {
        index =  (*((uint32_t *)keyword)) & 0xffffff;
        }
    else if (2 == keywordLen) {
        index =  (*((uint32_t *)keyword)) & 0xffff;
        }
    else if (1 == keywordLen) {
        index = keyword[0]&0xff;
        }
    else
        return;


    if ((keywordLen - 3) <= 0) {
        bitmap_unset(pTreeInfo->bitmapMemBlock, index);
        return;
        }

    p_2Node = pTreeInfo->root->child[index];
    if (NULL == p_2Node) 
        return;


    // 得到第二层对应的索引
    index = keyword[3] & 0xff;
    if ((keywordLen - 4) <= 0) {
        p_2Node->findTag[index] = 0;
        return;
        }

    pLeftbytesSet = p_2Node->child[index];
    if (NULL == pLeftbytesSet)
        return;


    // 遍历 第3层
    cnts = pLeftbytesSet->Cnts;
    high = cnts - 1;//假设数组是从小到大排列的
    low = 0;
    midle = cnts/2; 

    int beyondResult;
    while(high >= low) {
            midle = (high + low)/2;

            beyondResult = myStrcmp(pLeftbytesSet->keySet[midle], keyword+4);

            if(beyondResult > 0)
               high = midle - 1;
            else if(beyondResult < 0)
               low = midle + 1;
            else if(0 == beyondResult) { // 找到啦。。。
                strcpy(pLeftbytesSet->keySet[midle], pLeftbytesSet->keySet[cnts-1]);
                pLeftbytesSet->Cnts--;
                return;
                }
        }
}


//
// Search `str` for the first occurrence of any keyword in the tree.
//
// pTreeInfo - tree built with insertKeyword()
// str       - NUL-terminated text to scan
//
// Returns true as soon as a keyword is found, false when the end of the
// text is reached (or an argument is NULL). When _TEST_ is defined the
// match position and length are left in g_findKeyword / g_findKeyword_Len.
//
// The scan never reads past the terminating NUL, even when the text ends
// in the middle of a multi-byte character.
//
// NOTE(review): a keyword of 1 or 2 bytes is indexed without a third byte,
// so it is only matched when it is the very end of the text - confirm
// whether that is intended.
//
inline bool __attribute__((always_inline))
findKeyword(struct TreeInfo* pTreeInfo, char* str)
{
    int cnts, high, low, midle; // binary search state
    int i;

    struct _1_Node* root;
    struct _2_Node* p_2Node;
    struct LeftBytesSet* pLeftbytesSet;
    uint32_t index;

    unsigned char lead;
    unsigned char b1, b2;

    int beyondResult;           // comparison result

    int first_utf_8_word_len;   // byte length of the character at the current position

    if (NULL == pTreeInfo || NULL == str)
        return false;

    root = pTreeInfo->root;

    while (*str) {

        // Length of the character starting here, from its first byte
        // (same rule as get_utf_8_word_length(), written out inline).
        lead = (unsigned char)*str;
        if ((lead & 0xf0) == 0xe0) {
            // 1110xxxx: tested first because most Chinese characters take 3 bytes.
            first_utf_8_word_len = 3;
        }
        else if (lead < 0x80) {
            first_utf_8_word_len = 1;   // ASCII
        }
        else {
            // Count the leading 1 bits.
            first_utf_8_word_len = 0;
            do {
                lead = (unsigned char)(lead << 1);
                first_utf_8_word_len++;
            } while (lead & 0x80);
        }

        // First-level index from up to three bytes (low byte first). Bytes
        // after the terminating NUL are never read: once a NUL is seen the
        // remaining index bytes are taken as 0.
        b1 = (unsigned char)str[1];
        b2 = b1 ? (unsigned char)str[2] : 0;
        index = (uint32_t)(unsigned char)str[0] | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 16);

        if ((pTreeInfo->bitmapMemBlock[index>>3] & (1 << (index&7)))) {
#ifdef _TEST_
            g_findKeyword = str;
            g_findKeyword_Len = (0 == b1) ? 1 : ((0 == b2) ? 2 : 3);
#endif
            return true;
            }

        p_2Node = root->child[index];

        // Deeper levels need at least 4 more bytes of text; no keyword
        // contains a NUL, so a NUL in bytes 2..3 rules them out.
        if (NULL == p_2Node || 0 == b2 || '\0' == str[3])
            goto mytag;

        // Second-level index.
        index = str[3] & 0xff;
        if (_FOUND_TAG_ == p_2Node->findTag[index]) {
#ifdef _TEST_
            g_findKeyword = str;
            g_findKeyword_Len = 4;
#endif
            return true;
            }

        pLeftbytesSet = p_2Node->child[index];
        if (NULL == pLeftbytesSet)
            goto mytag;

        // Third level: compare the stored tails against the text at str+4.
        cnts = pLeftbytesSet->Cnts;

        if (cnts <= 7) {
            // Few rows: a plain scan beats the binary search.
            for (i = 0; i < cnts; ++i) {
                if (0 == myStrcmp(pLeftbytesSet->keySet[i], str+4)) {
#ifdef _TEST_
                    g_findKeyword = str;
                    g_findKeyword_Len = 4 + strlen(pLeftbytesSet->keySet[i]);
#endif
                    return true;
                    }
                }
            }
        else {
            // Binary search; the rows are kept in ascending order by insertKeyword().
            high = cnts - 1;
            low = 0;
            while (high >= low) {
                midle = (high + low)/2;

                beyondResult = myStrcmp(pLeftbytesSet->keySet[midle], str+4);

                if (beyondResult > 0)
                    high = midle - 1;
                else if (beyondResult < 0)
                    low = midle + 1;
                else {
#ifdef _TEST_
                    g_findKeyword = str;
                    g_findKeyword_Len = 4 + strlen(pLeftbytesSet->keySet[midle]);
#endif
                    return true;
                    }
                }
            }

mytag:
        // No keyword starts here: step over one character, but stop at the
        // terminator if the text ends inside a multi-byte sequence.
        for (i = 0; i < first_utf_8_word_len && *str; ++i)
            ++str;
        }

    return false;
}


//
// Create an empty keyword tree: the TreeInfo itself, its memory pool
// (128 MB blocks, at most 8 of them), a zeroed root node and a zeroed
// bitmap.
//
// Returns the new tree, or NULL when any allocation fails (everything
// acquired up to that point is released again). Release a tree with
// deinitTreeInfo().
//
struct TreeInfo* InitTreeInfo()
{
    struct TreeInfo* pTreeInfo = (struct TreeInfo*)malloc(sizeof(struct TreeInfo));
    if (NULL == pTreeInfo)
        return NULL;

    pTreeInfo->mPool = mpool_init(1024*1024*128, 8);            // create the memory pool
    if (NULL == pTreeInfo->mPool) {
        free(pTreeInfo);
        return NULL;
    }

    pTreeInfo->root = mpool_alloc(pTreeInfo->mPool, sizeof(struct _1_Node));
    if (NULL == pTreeInfo->root) {
        mpool_destroy(pTreeInfo->mPool);
        free(pTreeInfo);
        return NULL;
    }

    // Pool memory is not cleared; every first-level slot must start as NULL.
    memset(pTreeInfo->root, 0, sizeof(struct _1_Node));

    memset(pTreeInfo->bitmapMemBlock, 0, sizeof(pTreeInfo->bitmapMemBlock));

    return pTreeInfo;
}


//
// Release a tree created by InitTreeInfo(): the memory pool (and with it
// every node) and the TreeInfo itself. `pTreeInfo` must not be used
// afterwards. Passing NULL is a no-op.
//
void deinitTreeInfo(struct TreeInfo* pTreeInfo)
{
    if (NULL == pTreeInfo)
        return;

    mpool_destroy(pTreeInfo->mPool);
    free(pTreeInfo);    // the TreeInfo is malloc'ed by InitTreeInfo() and was previously leaked
}


//
// 功能: 测试性能
//
int testPerformance(int argc, char* argv[])
{
#define MAX_KEY_CNTS    256*1024                // 关键词条数最大值
#define MAX_KEY_LEN     64                      // 关键字最大的长度

    long long begTime, endTime, totalTime;
    struct ACAutoMation *mation;
    char (*StrLine)[MAX_KEY_LEN];           // 保存关键词条目内容
    int key_cnts    = 0;                    // 关键词条目数                   
    int lSize;                              // 文本文件大小
    char *text;                             // 测试文本内容
    int len;
    FILE *fTxt; 
    FILE *fDic;
    char ch;  
    int32_t findResult = 0;
    int i;

    char tmpFindKeyword[64] = {0};

    struct TreeInfo *pTreeInfo = InitTreeInfo();

    StrLine = (char(*)[MAX_KEY_LEN])malloc(MAX_KEY_LEN * MAX_KEY_CNTS);
    if (NULL == StrLine) {
        printf("分配内存失败。。。 \n");
        exit(1);
        }

    if(argc < 3) {
        printf("请输入参数:     如：  ./test text.txt dictionary.txt\n");
        exit(1);
        }

    if ((-1 != isFileOrDir(argv[1])) || (-1 != isFileOrDir(argv[2]))) {
        printf("参数传入的文件不存在，请正确输入参数:       如：  ./test dictionary.txt  text.txt\n");
        exit(1);
        }

    if((fTxt=fopen(argv[1], "r")) == NULL) {  
        printf("file cannot be opened\n");   
        exit(1);
    }

    if((fDic=fopen(argv[2], "r")) == NULL) {  
        printf("file cannot be opened\n");   
        exit(1);
    }

    fseek(fTxt, 3, SEEK_SET); // 跳过开头的三个字节的  BOM 头
    fseek(fDic, 3, SEEK_SET); // 跳过开头的三个字节的  BOM 头


    // 读取 Key 字典文件
    while (!feof(fDic) && (key_cnts < MAX_KEY_CNTS)) { 
      fgets(StrLine[key_cnts], MAX_KEY_LEN, fDic);  //读取一行

     // trim(StrLine[key_cnts]);            //去掉行末换行符
      len = myStrlen(StrLine[key_cnts]);    

      //printf("len = %d\n", len);

      if (0 != len) {
        ++key_cnts;  
        }
    } 
    fclose(fDic);


    printf("构建树   开始 ... \n");
    begTime = getSystemTime();
    for (i = 0; i < key_cnts; ++i) {
        len = myStrlen(StrLine[i]);
        insertKeyword(pTreeInfo, StrLine[i], len);  
        }
    endTime = getSystemTime();  
    printf("构建树 结束 ... 耗时为 %lld (ms)\n",  (endTime - begTime > 0) ? (endTime - begTime) : 1);


    // ----------------------------------------------------------------------
    begTime = getSystemTime();  
    // 读取整个要查找的文件
    fseek(fTxt, 0, SEEK_END);
    lSize = ftell(fTxt);
    text = (char*)malloc(lSize+1);
    fseek(fTxt, 0, SEEK_SET);   // rewind
    fread(text, sizeof(char), lSize, fTxt);
    text[lSize] = '\0';
    fclose(fTxt);

    findResult = findKeyword(pTreeInfo, text);
    endTime = getSystemTime();


    printf("------------------------\n\n");

    if (findResult > 0) {
        myStrcpy_s(tmpFindKeyword, g_findKeyword, g_findKeyword_Len);

        printf("找到关键字 =   (%s) \n", tmpFindKeyword);
        } 
    else {
        printf("没找到关键字！\n");
    }

    totalTime = (endTime - begTime > 0) ? (endTime - begTime) : 1;
    printf("关键字条目数: %d, 查找的文件大小: %d , 查找耗时: %lld ms, 查找速率: %lld (bytes/ms) \n", 
                key_cnts, lSize, totalTime, (lSize)/(int)(totalTime));

    deinitTreeInfo(pTreeInfo);
    return 0;
}


//
// Demo of keyword deletion: build a small tree, search a sample text,
// delete one keyword and search the same text again.
//
void test()
{
    struct TreeInfo *pTreeInfo = InitTreeInfo();
    char tmpFindKeyword[64] = {0};
    char* text = "kfk测试多几次fkfabca32bxa";
    char* delKeyword =  "测试多几次";
    int round;

    insertKeyword(pTreeInfo, "ab", strlen("ab"));
    insertKeyword(pTreeInfo, "abx", strlen("abx"));
    insertKeyword(pTreeInfo, "测试多几次", strlen("测试多几次"));

    // Round 0 searches the freshly built tree; round 1 first removes
    // delKeyword and then repeats the same search.
    for (round = 0; round < 2; ++round) {
        if (1 == round) {
            printf("\n---->  演示一下删除关键字 ----- \n\n");
            deleteKeyword(pTreeInfo, delKeyword, strlen(delKeyword));

            printf("删除关键字：( %s)\n 重新查找...\n \n", delKeyword);
        }

        if (!(findKeyword(pTreeInfo, text) > 0)) {
            printf("没找到关键字！\n");
            continue;
        }

        myStrcpy_s(tmpFindKeyword, g_findKeyword, g_findKeyword_Len);
        printf("找到关键字 =   (%s) \n", tmpFindKeyword);
    }

    deinitTreeInfo(pTreeInfo);
}


// Program entry point: runs the performance test on the files named on the
// command line. The deletion demo in test() is not run.
int main(int argc, char* argv[])
{
    (void)testPerformance(argc, argv);

    return 0;
}

mpool.h

/*******************************************************
内存池管理：
    目的：为了减少内存碎片化及每次申请内存消耗时间多，减少 Cache miss。
                                    2017.04.08
*******************************************************/
#ifndef MPOOL_H
#define MPOOL_H

#include <stdio.h>
//#include <stdlib.h>

// INLINE(T): declares a function returning T; on non-MSVC compilers it is
// also static and force-inlined.
// NOTE(review): INLINE is not used anywhere in the visible source.
#ifdef _MSC_VER // Windows
#define INLINE(RETURN_TYPE) RETURN_TYPE 
#else
#define INLINE(RETURN_TYPE) \
    static inline RETURN_TYPE __attribute__((always_inline))
#endif

#define M_ALIGNMENT     8       // every pool allocation is rounded up to a multiple of this many bytes
// NOTE(review): this packing setting is never restored, so it also applies
// to everything declared after this header is included - confirm intended.
#pragma pack(8)

// Allocation hooks; define MPOOL_MALLOC before including this header to
// override them.
// NOTE(review): the visible mpool.c calls malloc/free directly rather than
// these hooks.
#ifndef MPOOL_MALLOC
#define MPOOL_MALLOC(sz) malloc(sz)
#define MPOOL_REALLOC(p, sz) realloc(p, sz)
#define MPOOL_FREE(p, sz) free(p)
#endif

// Bookkeeping for one block owned by the pool.
typedef struct {
    void* pBlock;           // start address of the block (the pointer that is later freed)
    void* pCur;             // first byte of the block that has not been handed out yet
    int remainderSize;      // number of bytes of the block still unused
} mBlockInfo;

// State of one memory pool.
typedef struct {
    unsigned int defaultBlockSize;          // requested size of each block in bytes. This is only the desired size: the block actually obtained may be smaller.
                                            // A block is first requested at this size; if that fails the size is halved, repeatedly, until the allocation succeeds.
    unsigned int nAllocBytes;               // total number of bytes obtained for blocks so far
    unsigned int blockCnts;                 // number of blocks allocated so far

    mBlockInfo mBlockInfoArray[1];          // variable-length trailing array holding the info of every allocated block
    // NOTE: do not add fields below this variable-length array
} mpool;

// Create a memory pool with the given block size.
// Parameters: nBlockSize   - desired size of each block in bytes
//             maxBlockCnts - maximum number of blocks (NOTE: blocks are not managed
//                            through a linked list; their pointers are kept in a plain array)
mpool *mpool_init(unsigned int nBlockSize, unsigned int maxBlockCnts);

// Hand out nBytes (rounded up to M_ALIGNMENT) from the pool; returns NULL on failure.
// The returned memory is not cleared.
void* mpool_alloc(mpool* mp, int nBytes);
// Free every block of the pool and the pool itself.
void mpool_destroy(mpool* mp);
#endif

mpool.c

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "mpool.h"


// Page size used to round pool block sizes. Hard-coded to the common value
// of 4096 bytes; the operating system is not queried.
long getPageSize()
{
    const long pageBytes = 4096;

    return pageBytes;
}

// Create a memory pool.
//
// nBlockSize   - desired size of each block in bytes; rounded up to a
//                multiple of the page size
// maxBlockCnts - maximum number of blocks the pool may hold (block pointers
//                are kept in a plain array sized from this value, not in a
//                linked list); 0 is treated as 1
//
// No block is allocated yet; blocks are obtained on demand by mpool_alloc().
// Returns the new pool, or NULL when the pool header cannot be allocated.

mpool *
mpool_init(unsigned int nBlockSize, unsigned int maxBlockCnts /* = 100*/)
{
    mpool * pMpool;
    long pgsz;

    // The struct already contains one slot; with 0 the "- 1" below would
    // wrap around to a huge count.
    if (0 == maxBlockCnts)
        maxBlockCnts = 1;

    pMpool = (mpool*)malloc(sizeof(mpool) + (size_t)(maxBlockCnts - 1) * sizeof(mBlockInfo));
    if (NULL == pMpool)
        return NULL;

    pgsz = getPageSize();  // page size used for rounding
    pMpool->defaultBlockSize = pgsz * ((nBlockSize + pgsz - 1) / pgsz); // round the requested size up to a whole number of pages
    pMpool->blockCnts = 0;
    pMpool->nAllocBytes = 0;

    return pMpool;
}


// Hand out `nBytes` bytes from the pool.
//
// The request is rounded up to a multiple of M_ALIGNMENT. The first
// existing block with enough room is used; otherwise a new block is
// obtained with malloc(), halving the block size after each failed attempt.
// Returns NULL (after printing a diagnostic) when nBytes exceeds the pool's
// block size or no usable block could be obtained. The memory is not cleared.
//
// NOTE(review): the pool does not record how many block slots were
// allocated by mpool_init(), so nothing here stops blockCnts from growing
// past that number - confirm callers stay within it.
void*
mpool_alloc(mpool* mp, int nBytes)
{
    assert(mp);
    void* result = NULL;
    unsigned int slot;
    const unsigned int usedBlocks = mp->blockCnts;
    unsigned int chunkSize = mp->defaultBlockSize;
    // Bytes really taken from a block: the request rounded up for alignment.
    const unsigned int needed = M_ALIGNMENT * ((nBytes + M_ALIGNMENT - 1) / M_ALIGNMENT);
    mBlockInfo* blk = NULL;

    if (mp->defaultBlockSize < nBytes)
    {
        printf("指定分配的内存大小 > 指定的内存块大小\n");
        return NULL;
    }

    // First fit over the blocks the pool already owns.
    for (slot = 0; slot < usedBlocks; ++slot)
    {
        blk = &(mp->mBlockInfoArray[slot]);
        if (blk->remainderSize >= needed)
        {
            result = blk->pCur;
            blk->pCur = (void*)((char*)blk->pCur + needed);
            blk->remainderSize -= needed;
            break;
        }
    }

    // Nothing fits: obtain a new block, halving the size while malloc() fails.
    if (NULL == result)
    {
        void* fresh = malloc(chunkSize);
        while (NULL == fresh && 0 != (chunkSize /= 2))
        {
            fresh = malloc(chunkSize);
        }

        if (NULL != fresh)
        {
            mp->nAllocBytes += chunkSize;

            blk = &(mp->mBlockInfoArray[mp->blockCnts]);
            blk->pBlock = fresh;
            blk->pCur = fresh;
            blk->remainderSize = chunkSize;

            // A halved block may be too small for this request; it is still
            // kept in the pool for later, smaller requests.
            if (chunkSize >= needed)
            {
                blk->pCur = (void*)((char*)blk->pCur + needed);
                blk->remainderSize -= needed;
                result = fresh;
            }

            mp->blockCnts++;
        }
    }

    if (NULL == result)
    {
        printf("ffk\n");
    }

    return result;
}


// Free every block owned by the pool, then the pool itself.
// `mp` must not be used afterwards. Passing NULL is a no-op.
void
mpool_destroy(mpool* mp)
{
    unsigned int i;

    if (NULL == mp)
        return;

    for (i = 0; i < mp->blockCnts; ++i)
    {
        free(mp->mBlockInfoArray[i].pBlock);
    }

    free(mp);
}

getTime.h

#ifndef GETTIME_H
#define GETTIME_H

#ifdef _MSC_VER // windows 操作系统

#include <ctime>
#include <windows.h>
#include <stdarg.h>

// Current time in milliseconds, taken from the Windows GetTickCount() counter.
long long getSystemTime()
{
    const long long ticks = GetTickCount();

    return ticks;
}

#else           // Linux 操作系统

#include <sys/time.h>
#include <unistd.h>// sleep(3);
#include <sys/timeb.h>//timeb

// Current wall-clock time in milliseconds since the Unix epoch.
// Uses gettimeofday() from <sys/time.h> instead of the obsolete ftime(),
// and does the arithmetic in long long so it cannot overflow a narrow time_t.
// Returns 0 if the clock cannot be read.
long long getSystemTime()
{
    struct timeval now;

    if (0 != gettimeofday(&now, NULL))
        return 0;

    return 1000LL * (long long)now.tv_sec + (long long)(now.tv_usec / 1000);
}
#endif

#endif

Makefile

# Neither target produces a file of its own name.
.PHONY: main clean

# Headers are pulled in by #include; only the .c files are compiled.
main:
	gcc mpool.c FindKeyword.c -o findKeyword -O3
clean:
	rm -f findKeyword

1.多模式匹配
2. 常用算法

W_SX12553

关注

2
点赞
踩
3

收藏

觉得还不错? 一键收藏
1
评论
关键字查找

中文关键字查找(敏感词过滤)背景近来需要在极短的时间来查找某一段文字是否出现关键字(敏感字)的应用。网上找了一些资料，有用java写的双数组AC树，达到每秒27Mb 的速度。用c写的ac树每秒也只有 30M。以下是我用多叉树 c 语言写的，查找速度85~92Mb/s主要代码如下：//// 功能：测试一下树，2^24叉树作为第一级,进行敏感词过滤//////
复制链接

扫一扫