如何设计高效的聊天过滤词算法?

关于聊天过滤词算法,一直困扰着我,了解到很多算法,比如:KMP, 正则循环匹配等,然后在http://www.dewen.org/q/41/%E5%A6%82%E4%BD%95%E8%AE%BE%E8%AE%A1%E9%AB%98%E6%95%88%E7%9A%84%E8%81%8A%E5%A4%A9%E8%BF%87%E6%BB%A4%E8%AF%8D%E7%AE%97%E6%B3%95%EF%BC%9F看到了一篇文章,现摘要几种相对好的答案,以备不时之需。

1.基于trie树算法

我们的解决方法是构造一棵trie树。每个节点包含256个槽位,分别对应字节值0~255。

用脏词字典来构造这个树。

树的结构大概如下

具体实现代码如下:

namespace KGame
{
class WordFilter
{
public:
    WordFilter() {}

    ~WordFilter() 
    {
        Clean(&m_Filter);
    }

    void AddWord(const char* word)
    {
        UInt32 len = (UInt32)strlen(word);
        Filter* filter = &m_Filter;

        for (UInt32 i = 0; i < len; i++)
        {
            unsigned char c = word[i];
            if (i == len - 1)
            {
                filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_IS_END;
                break;
            }
            else
            {
                filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_HAS_NEXT;
            }

            if (filter->m_NodeArray[c].m_NextFilter == NULL)
            {
                Filter* tmpFilter = XNEW (Filter)();
                filter->m_NodeArray[c].m_NextFilter = tmpFilter;
            }

            filter = (Filter *)filter->m_NodeArray[c].m_NextFilter;
        }
    }

    void AddWords(const std::set<std::string>& wordList)
    {
        for (std::set<std::string>::const_iterator it = wordList.begin();
            it != wordList.end(); it++)
        {
            AddWord(it->c_str());
        }
    }

    void AddWords(const std::vector<std::string>& wordList)
    {
        for (std::vector<std::string>::const_iterator it = wordList.begin();
            it != wordList.end(); it++)
        {
            AddWord(it->c_str());
        }
    }

    void AddWords(const KGame::Set<std::string>& worldList)
    {
        for (KGame::Set<std::string>::Iter* iter = worldList.Begin();
            iter != worldList.End(); iter = worldList.Next(iter))
        {
            AddWord(iter->m_Value.c_str());
        }
    }

    Int32 Check(const char* str)
    {
        Filter* filter = NULL;
        for (Int32 i = 0; i < (int)strlen(str) - 1; i++)
        {
            filter = &m_Filter;
            for (UInt32 j = i; j < strlen(str); j++)
            {
                unsigned char c = str[j]; 
                if ((c >= 'A' && c <= 'Z'))
                {
                    c += 32;
                }

                if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL)
                {
                    break;
                }
                else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END)
                {
                    return i;
                }
                else // NODE_HAS_NEXT
                {
                    filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;
                }
            }
        }
        return -1;
    }

    void CheckAndModify(char* str, const char replace = '*')
    {
        Filter* filter = NULL;
        for (Int32 i = 0; i < (int)strlen(str) - 1; i++)
        {
            filter = &m_Filter;
            for (UInt32 j = i; j < strlen(str); j++)
            {
                unsigned char c = str[j]; 
                if ((c >= 'A' && c <= 'Z'))
                {
                    c += 32;
                }

                if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL)
                {
                    break;
                }
                else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END)
                {
                    for (UInt32 k = i; k <= j; k++)
                    {
                        str[k] = replace;
                    }

                    if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT)
                    {
                        filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;
                    }
                    else
                    {
                        continue;
                    }
                }
                else // NODE_HAS_NEXT
                {
                    filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;
                }
            }
        }
    }

    void CheckAndModify(std::string& str, const char replace = '*')
    {
        Filter* filter = NULL;
        for (Int32 i = 0; i < (int)str.size() - 1; i++)
        {
            filter = &m_Filter;
            for (UInt32 j = i; j < str.size(); j++)
            {
                unsigned char c = str[j]; 
                if ((c >= 'A' && c <= 'Z'))
                {
                    c += 32;
                }
                if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL)
                {
                    break;
                }
                else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END)
                {
                    for (UInt32 k = i; k <= j; k++)
                    {
                        str[k] = replace;
                    }

                    if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT)
                    {
                        filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;
                    }
                    else
                    {
                        continue;
                    }
                }
                else // NODE_HAS_NEXT
                {
                    filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;
                }
            }
        }
    }

private:

    struct FilterNode
    {
        char m_Flag;
        void* m_NextFilter;

        enum Flag
        {
            NODE_IS_NULL          = 0x00,
            NODE_HAS_NEXT         = 0x01,
            NODE_IS_END           = 0x10,
        };
        FilterNode() : m_Flag(NODE_IS_NULL), m_NextFilter(NULL) {}
    };

    struct Filter
    {
        FilterNode m_NodeArray[256];
    } m_Filter;

    void Clean(Filter* filter)
    {
        for (UInt32 i = 0; i < 256; i++)
        {
            if (filter->m_NodeArray[i].m_NextFilter)
            {
                Clean((Filter *)filter->m_NodeArray[i].m_NextFilter);
                XDELETE((Filter*)filter->m_NodeArray[i].m_NextFilter);
            }
        }
    }
};
} // namespace KGame

2.基于KMP算法

聊天过滤词算法的解决思路
提高过滤的算法个人认为主要从两个方面考虑:(1)尽量减少内存、IO的次数。(2)增加串内查找的速度。
基于这两点我想采用连续的内存片,可以减少内存地址跳跃的次数,采用静态的内存这就解决了(1)的问题,第二点是增加串内查找的速度,这个比较公认的是KMP算法

// Word filter that keeps its blocked words in a flat pointer table and
// searches the text for each of them in turn.
class WordFilter
{
public:
WordFilter();
~WordFilter();

// Builds the m_words table from the words currently held in m_storage.
void Init();
// Masks blocked words found in `word`, in place.
void FilterWord(string& word);
// KMP substring search for T in S starting at pos; see the definition for
// the return convention.
int  Index_KMP(const char* S, const char* T, int pos);

private:
// Owns the blocked words.
std::set<string> m_storage;
// malloc'ed array of pointers into the strings of m_storage (set by Init).
const char** m_words;
// Number of entries in m_words.
uint32 m_count;
};

// Starts with no word table; Init() builds it.
WordFilter::WordFilter()
    : m_words(NULL)
    , m_count(0)
{
}

// Releases the pointer table; the words themselves belong to m_storage.
WordFilter::~WordFilter()
{
    // free(NULL) is a no-op, so no explicit null check is needed.
    free(m_words);
}

void WordFilter::Init()
{
// 把所有屏蔽词都放到m_storage里
m_count = m_storage.size();
if(m_count) {
m_words = (const char**)malloc(sizeof(char*)*m_count);
std::set<string>::iterator ptr;
int i = 0;
for(ptr = m_storage.begin(); ptr != m_storage.end(); ++ptr,i++) {
m_words[i] = ptr->c_str();
}
}
}

// Knuth-Morris-Pratt search on NUL-terminated strings.
//
// Returns the 0-based offset of the first occurrence of `pattern` in `text`
// at or after `pos`, or -1 when there is none (or when an argument is NULL or
// `pos` is outside the text). An empty pattern matches at `pos`.
static int _kmpSearch(const char* text, const char* pattern, int pos)
{
    if (text == NULL || pattern == NULL || pos < 0) {
        return -1;
    }

    const int textLen = (int)strlen(text);
    const int patLen = (int)strlen(pattern);
    if (pos > textLen) {
        return -1;
    }
    if (patLen == 0) {
        return pos;
    }

    // fail[k] = length of the longest proper prefix of pattern[0..k] that is
    // also a suffix of it.
    int* fail = (int*)malloc(sizeof(int) * patLen);
    if (fail == NULL) {
        // Out of memory: fall back to the library search rather than
        // silently reporting "not found".
        const char* hit = strstr(text + pos, pattern);
        return hit ? (int)(hit - text) : -1;
    }

    fail[0] = 0;
    for (int i = 1, k = 0; i < patLen; ++i) {
        while (k > 0 && pattern[i] != pattern[k]) {
            k = fail[k - 1];
        }
        if (pattern[i] == pattern[k]) {
            ++k;
        }
        fail[i] = k;
    }

    int found = -1;
    for (int i = pos, k = 0; i < textLen; ++i) {
        while (k > 0 && text[i] != pattern[k]) {
            k = fail[k - 1];
        }
        if (text[i] == pattern[k]) {
            ++k;
        }
        if (k == patLen) {
            found = i - patLen + 1;
            break;
        }
    }

    free(fail);
    return found;
}

// Masks `oldstr` with '*' at `word` and at every later occurrence.
//
// `word` points into the writable original text and `lowerWord` at the same
// offset of its lower-cased copy. The caller guarantees that `oldstr` occurs
// at `lowerWord`. Later occurrences are located in the lower-cased copy and
// masked at the matching offset of the original.
static inline void _filterWord(char* word, const char* lowerWord, const char* oldstr)
{
    const int len = (int)strlen(oldstr);
    if (len == 0) {
        // An empty pattern matches everywhere and would never advance.
        return;
    }

    memset(word, '*', len);
    word += len;
    lowerWord += len;

    int at;
    while ((at = _kmpSearch(lowerWord, oldstr, 0)) >= 0) {
        word += at;
        memset(word, '*', len);
        word += len;
        lowerWord += at + len;
    }
}

// Replaces each occurrence of each blocked word in `word` with '*' bytes.
//
// Matching runs against a lower-cased copy of the text.
// NOTE(review): this assumes the words in m_words are already lower-case and
// that str_tolower() lower-cases in place without changing the length. Confirm
// both against str_tolower() and the code that fills m_storage.
void WordFilter::FilterWord(string& word)
{
    if (word.empty() || m_words == NULL) {
        return;
    }

    string lowered(word);
    str_tolower(lowered);

    for (uint32 i = 0; i < m_count; i++) {
        const char* bad = m_words[i];
        if (bad == NULL || *bad == '\0') {
            continue;
        }

        const int at = Index_KMP(lowered.c_str(), bad, 0);
        if (at >= 0) {
            // Write through the string's own mutable storage, not through a
            // cast-away-const c_str() pointer.
            _filterWord(&word[0] + at, lowered.c_str() + at, bad);
        }
    }
}

// KMP substring search.
//
// S is the NUL-terminated text, T the NUL-terminated pattern, and pos the
// 0-based offset in S at which to start.
// Returns the 0-based offset of the first match at or after pos, or -1 if T
// does not occur (0 is a valid match offset, so it cannot signal failure).
int WordFilter::Index_KMP(const char* S, const char* T, int pos)
{
    return _kmpSearch(S, T, pos);
}

以上两种方法相对比较好一点。以做参考。


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值