以前做ASP.NET的时候接触过敏感词屏蔽,在C#那种强大的框架下,直接用切词工具再加一个敏感词字典就能搞定一切。回到C++,我肯定不会因为做个敏感词屏蔽就去引入一个框架:第一是追求效率,第二是看别人写的C++代码也挺痛苦的。接下来,就讲解一下敏感词屏蔽的具体思路与代码。
首先要解决的问题是敏感词的存储形式,这就涉及数据结构。先想想搜索屏蔽要怎么处理:比如我有一个content,我就遍历它的每个字符,先看词典中所有词里第一个字符与之相同的,再看第二个,再看第三个,依此类推。那么很明显,这就需要一种按层存储的数据结构——树——来存储敏感词汇。我首先设计了一个Node,它要存储同一级的node指针(sibling)、下一级的node指针(next)、词结束的标识(last)以及数据(data)。最开始本来只想到用树的结构,最后发现,这居然就是个二叉树:可以抽象成左边是next,右边是sibling,那问题就简单了。
我的代码实现非常简单明了,而且中文那些都完全没问题,不像其他人弄的那么复杂,而且随便测试,毫无BUG
#include<cstring>
#include<ctime>
#include<fstream>
#include<iostream>
#include<list>
#include<sstream>
#include<string>
#include<vector>
using namespace std;
// One node of the keyword trie, stored in left-child / right-sibling form:
// `next` leads one character deeper into a keyword, `sibling` leads to the
// next alternative character at the same depth.
struct Node{
    char data;      // byte of the keyword held by this node
    bool last;      // true when some keyword ends exactly at this node
    Node* sibling;  // next node on the same level, or 0 at the end of the level
    Node* next;     // first node of the following level, or 0 for a leaf

    // Every node starts empty and unlinked, so no field is ever read
    // uninitialised regardless of how the node is created.
    Node():data(0),last(false),sibling(0),next(0){}
};
// Reads the keyword dictionary "keyword.txt" (relative to the current working
// directory) and appends every keyword not yet present to list_keyword.
//
// The file is treated as a single string of keywords separated by '@'.
// Empty fields (leading, trailing or consecutive '@') are skipped, and only
// the text up to the first NUL byte is considered.
//
// list_keyword: in/out. Each appended entry is a NUL-terminated copy
//               allocated with new char[]; ownership passes to the caller,
//               who must release it with delete[].
// If the file cannot be opened, list_keyword is left unchanged.
//
// NOTE(review): line breaks are not separators, so a trailing newline in the
// file becomes part of the last keyword - confirm the dictionary is written
// without one.
void GetKeyWordList(std::list<char*>& list_keyword){
    std::ifstream in("keyword.txt");
    if(!in){
        return;
    }
    std::stringstream ss;
    ss<<in.rdbuf();//read keyword all text
    in.close();

    // Heap-backed copy instead of a variable-length stack array; building it
    // from c_str() stops at the first NUL, like C-string tokenising would.
    const std::string text(ss.str().c_str());
    const char delim='@';

    std::string::size_type begin=text.find_first_not_of(delim);
    while(begin!=std::string::npos){
        std::string::size_type end=text.find(delim,begin);
        if(end==std::string::npos){
            end=text.size();
        }
        const std::string word=text.substr(begin,end-begin);

        // Skip keywords that are already in the list.
        bool exist=false;
        for(std::list<char*>::const_iterator it=list_keyword.begin();it!=list_keyword.end();++it){
            if(word==*it){
                exist=true;
                break;
            }
        }
        if(!exist){
            char* copy=new char[word.size()+1];
            std::memcpy(copy,word.c_str(),word.size()+1);//includes the terminating NUL
            list_keyword.push_back(copy);
        }
        begin=text.find_first_not_of(delim,end);
    }
}
// Frees every keyword string held in list_keyword and empties the list.
//
// list_keyword: entries must have been allocated with new char[] (as
//               GetKeyWordList does), so they are released with delete[].
//               The list is cleared afterwards so that no dangling pointer
//               remains reachable through it.
void ReleaseKeyWordList(std::list<char*>& list_keyword){
    for(std::list<char*>::iterator it_keyword=list_keyword.begin();it_keyword!=list_keyword.end();++it_keyword){
        delete[] *it_keyword;
    }
    list_keyword.clear();
}
// Scans the sibling chain that starts at `cur` for a node holding `data`.
// Returns the first such node, or 0 when the chain is empty or contains no
// match.
Node* NodeHasExist(Node* cur,char data){
    for(Node* candidate=cur;candidate!=0;candidate=candidate->sibling){
        if(candidate->data==data){
            return candidate;
        }
    }
    return 0;
}
// Builds the keyword trie from the dictionary returned by GetKeyWordList.
//
// Returns a heap-allocated root node (data 0) whose `next` chain holds the
// first characters of all keywords. Each keyword is laid out one character
// per level, and the node of its final character gets last=true.
// The caller owns the tree and frees it with ReleaseTree.
Node* BuildTree(){
    std::list<char*> keywords;
    GetKeyWordList(keywords);

    Node* const root=new Node();
    root->data=0;
    root->sibling=0;
    root->next=0;

    for(std::list<char*>::iterator word=keywords.begin();word!=keywords.end();++word){
        Node* parent=root;//every keyword is inserted starting from the root
        for(const char* p=*word;*p!='\0';++p){
            // Walk the child chain through the link that points at each node,
            // so the same pointer serves both for "found" and for "append here".
            Node** slot=&parent->next;
            while(*slot!=0&&(*slot)->data!=*p){
                slot=&(*slot)->sibling;
            }
            if(*slot==0){
                // No child carries this character yet: add it at the end of the chain.
                Node* fresh=new Node();
                fresh->data=*p;
                fresh->last=false;
                fresh->sibling=0;
                fresh->next=0;
                *slot=fresh;
            }
            parent=*slot;
        }
        // A keyword ends here; the node may still have deeper branches.
        parent->last=true;
    }

    ReleaseKeyWordList(keywords);
    return root;
}
// Deletes `root` together with everything reachable from it through `next`
// and `sibling`. Passing 0 is a no-op.
void ReleaseTree(Node* root){
    // Iterate along the sibling chain and recurse only into each node's
    // children.
    while(root!=0){
        Node* const following=root->sibling;//remember before root is deleted
        ReleaseTree(root->next);
        delete root;
        root=following;
    }
}
// Filters `content` against the keyword trie and prints the result, followed
// by a newline, to standard output.
//
// content: NUL-terminated text to filter (not modified).
// root:    root node of the trie built by BuildTree; must not be null.
//
// Each keyword occurrence is replaced by a single "*" (one star per keyword,
// not per character); all other bytes are copied through unchanged. When a
// keyword is a prefix of a longer one, the star is emitted as soon as the
// shorter keyword is complete and the scan keeps trying to extend the match.
// If the extension fails, scanning resumes right after the shorter keyword.
//
// The loop runs one step past the last character. That extra step never
// matches, so a match still pending at end of input is resolved by the same
// mismatch handling as anywhere else, and no trailing text is lost.
void LookUp(char* content,Node* root){
    const int length=static_cast<int>(std::strlen(content));//hoisted out of the loop
    Node* cur=root->next;     //candidates for the current character
    int start=-1;             //-1: no match in progress; otherwise the index to resume from
    std::stringstream ss;     //filtered output
    bool shouldReview=false;  //true while extending past an already reported keyword
    for(int i=0;i<=length;i++){//traversal content, plus one end-of-input step
        bool hasfind=false;//if hasfind,find next char
        const char ch=content[i];
        while(i<length&&cur!=0){//the end-of-input step never matches
            if(cur->data==ch){
                hasfind=true;
                if(start==-1){//first time find char ,record first time index
                    start=i;
                }
                if(cur->last&&cur->next==0){//keyword complete and nothing longer exists
                    start=-1;
                    ss<<"*";
                    cur=root->next;
                    shouldReview=false;
                }
                else if(cur->last){//keyword complete, but a longer keyword may follow
                    start=i+1;//resume here if the longer match fails
                    ss<<"*";
                    cur=cur->next;
                    shouldReview=true;
                }
                else{
                    cur=cur->next;
                }
                break;
            }
            cur=cur->sibling;
        }
        if(shouldReview&&!hasfind){
            //the extension failed: rescan from just after the reported keyword
            i=start-1;//start-1 because of i++
            shouldReview=false;
            start=-1;
            cur=root->next;
        }
        else if(!hasfind){
            if(start>-1){
                i=start;//partial match failed: emit its first byte and rescan from the next one
            }
            else if(i==length){
                break;//end of input with nothing pending
            }
            ss<<content[i];
            start=-1;
            cur=root->next;
        }
    }
    std::cout<<ss.str()<<std::endl;
}
// Interactive driver: builds the keyword trie, then filters each line read
// from standard input until the line "q!" or end of input, printing the
// filtered text and the CPU time the filtering took.
int main(){
    Node* root=BuildTree();
    std::string line;
    // getline on std::string has no length limit, and testing its result ends
    // the loop on end of input or a stream error instead of spinning forever.
    while(std::getline(std::cin,line)){
        if(line=="q!"){
            break;
        }
        // LookUp takes a mutable C string, so hand it a NUL-terminated copy.
        std::vector<char> input(line.begin(),line.end());
        input.push_back('\0');
        const std::clock_t start=std::clock();
        LookUp(&input[0],root);
        const std::clock_t end=std::clock();
        // clock() counts ticks; convert to milliseconds before labelling it "ms".
        // The label was mis-encoded (GBK bytes read as Latin-1) and is restored here.
        std::cout<<"过滤耗时:"<<(end-start)*1000/CLOCKS_PER_SEC<<" ms"<<std::endl<<std::endl;
    }
    ReleaseTree(root);
    return 0;
}
测试结果: