C++ 敏感词屏蔽

最新推荐文章于 2024-05-12 20:01:39 发布

lld951027

最新推荐文章于 2024-05-12 20:01:39 发布

阅读量5.6k

点赞数 6

分类专栏： C/C++ 文章标签： C++ 算法

本文链接：https://blog.csdn.net/nightwizard2030/article/details/70544697

版权

C/C++ 专栏收录该内容

66 篇文章 6 订阅

订阅专栏

以前做ASP.NET的时候接触过敏感词屏蔽，在c#那种强大的框架下，直接用切词工具再加一个敏感词字典就搞定一切。回到c++，我肯定不会因为搞个敏感词就去引入个框架，第一追求效率，第二看别人写的c++代码也挺痛苦的。接下来，就讲解下对于敏感词屏蔽的具体思路与代码。

首先要解决的问题是敏感词的存储形式，这就涉及数据结构。先想想搜索屏蔽要怎么处理：比如我有一个content，我就遍历它的每个字符，先看词典中有哪些词的第一个字符与它相同，再比较第二个、第三个字符，依此类推。很明显，这需要一种按层存储的数据结构——树——来存储敏感词汇。我首先设计了一个Node，它要存储四样东西：同一级的node指针（sibling）、下一级的node指针（next）、标识词是否在此结束的标志（last），以及字符数据（data）。最开始本来只想到用多叉树的结构，最后发现，这其实可以表示成一棵二叉树（左孩子-右兄弟表示法）：左边是next，右边是sibling，那问题就简单了。

我的代码实现非常简单明了，而且中文那些都完全没问题，不像其他人弄的那么复杂，而且随便测试，毫无BUG

#include<cstring>
#include<ctime>
#include<fstream>
#include<iostream>
#include<list>
#include<sstream>
#include<string>
using namespace std;

// One node of the keyword tree, kept in left-child / right-sibling form:
// `next` leads to the first node of the following character level and
// `sibling` links the alternative bytes that share the same level.
struct Node{
	char data;      // byte stored at this node (0 for the root made by BuildTree)
	bool last;      // true when a keyword ends at this node
	Node* sibling;  // next alternative at the same level, or null
	Node* next;     // first node of the next level, or null

	// Start every node empty so no field is ever read uninitialised,
	// whichever way the node is created (`Node n;`, `new Node`, `new Node()`).
	Node():data(0),last(false),sibling(0),next(0){}
};

// Reads the keyword dictionary "keyword.txt" (relative to the current working
// directory) and appends every keyword that is not already in list_keyword.
//
// Keywords are separated by '@'. Line breaks ('\r', '\n') are treated as
// separators too, so a trailing newline in the file does not become part of
// the last keyword. Empty entries are skipped.
//
// Every appended entry is a NUL-terminated copy allocated with new[]; the
// caller owns it and must release it with delete[].
// If the file cannot be opened or is empty, list_keyword is left unchanged.
void GetKeyWordList(std::list<char*>& list_keyword){
	std::ifstream in("keyword.txt");
	if(!in){
		return;
	}
	std::stringstream ss;
	ss<<in.rdbuf();//read keyword all text
	in.close();

	// A single heap-backed, writable copy of the text. This replaces the
	// variable-length array, which is not standard C++ and would place the
	// whole dictionary on the stack.
	std::string content=ss.str();
	if(content.empty()){
		return;
	}

	static const char delims[]="@\r\n";
	for(char* token=std::strtok(&content[0],delims);token!=0;token=std::strtok(0,delims)){
		bool exist=false;
		for(std::list<char*>::iterator it_keyword=list_keyword.begin();it_keyword!=list_keyword.end();++it_keyword){
			if(std::strcmp(*it_keyword,token)==0){
				exist=true;
				break;
			}
		}
		if(exist){//keep only the first occurrence of a repeated keyword
			continue;
		}
		const std::size_t size=std::strlen(token)+1;//include the terminating NUL
		char* copy=new char[size];
		std::memcpy(copy,token,size);
		list_keyword.push_back(copy);
	}
}

// Frees every keyword string held in list_keyword and empties the list.
//
// The entries are arrays created with new[], so they must be released with
// delete[]; plain delete on them is undefined behaviour. The list is cleared
// afterwards so it does not keep pointers to freed memory.
void ReleaseKeyWordList(std::list<char*>& list_keyword){
	for(std::list<char*>::iterator it_keyword=list_keyword.begin();it_keyword!=list_keyword.end();++it_keyword){
		delete[] *it_keyword;
	}
	list_keyword.clear();
}

// Searches the sibling chain that starts at `cur` for a node holding `data`.
// Returns the first such node, or null when the chain is empty or has no match.
Node* NodeHasExist(Node* cur,char data){
	for(Node* candidate=cur;candidate!=0;candidate=candidate->sibling){
		if(candidate->data==data){
			return candidate;
		}
	}
	return 0;
}


// Builds the keyword tree from the list produced by GetKeyWordList and returns
// its root. The root carries no character (data 0); the keywords hang below
// root->next. The temporary keyword list is released before returning.
Node* BuildTree(){
	std::list<char*> keywords;
	GetKeyWordList(keywords);

	Node* const root=new Node();
	root->data=0;
	root->sibling=0;
	root->next=0;

	for(std::list<char*>::iterator word=keywords.begin();word!=keywords.end();++word){
		Node* parent=root;//every keyword is inserted starting from the root
		for(const char* p=*word;*p!=0;++p){
			// Walk the child chain of `parent` once: stop at the node that
			// already holds *p, or fall off the end remembering the tail.
			Node* tail=0;
			Node* child=parent->next;
			while(child!=0&&child->data!=*p){
				tail=child;
				child=child->sibling;
			}

			if(child==0){//byte not present at this level yet: append a new node
				child=new Node();
				child->data=*p;
				child->last=false;
				child->next=0;
				child->sibling=0;
				if(tail==0){
					parent->next=child;//first node of this level
				}
				else{
					tail->sibling=child;//append after the current last sibling
				}
			}

			parent=child;//descend one level
		}
		parent->last=true;//a keyword ends here, even if longer keywords continue below
	}

	ReleaseKeyWordList(keywords);

	return root;
}

// Deletes the node `root` together with everything reachable from it through
// `next` and `sibling`. A null pointer is accepted and does nothing.
void ReleaseTree(Node* root){
	while(root!=0){
		Node* const right=root->sibling;//remember the sibling before the node is freed
		ReleaseTree(root->next);//free the whole sub-level first
		delete root;
		root=right;
	}
}
	                   
void LookUp(char* content,Node* root){
	Node* cur=root->next;
	int start=-1;
	stringstream ss;
	bool shouldReview=false;
	for(int i=0;i<strlen(content);i++){//traversal content
	
		bool hasfind=false;//if hasfind,find next char
		
		char ch=content[i];
		
		while(cur!=0){
			if(cur->data==ch){
				hasfind=true;
				if(start==-1){//first time find char ,record first time index
					start=i;
				}
				if(cur->last&&cur->next==0){//it has arrived at last branch
					start=-1;
					ss<<"*";
					cur=root->next;
					shouldReview=false; 
				}
				else if(cur->last){//it may not arrived at last branch
					start=i+1;
					ss<<"*";
					cur=cur->next;
					shouldReview=true;
				}
				else{
					cur=cur->next;
				}
				break;
			}
			cur=cur->sibling;
		}
		
		if(shouldReview&&!hasfind){
			i=start-1;//this time is checking cur->next,it should also check root->next,start-1 because of i++
			shouldReview=false;
			start=-1;
			cur=root->next;
		}
		else if(!hasfind){
			if(start>-1){
				i=start;//start must have checked with root->next,so it was not used to be -1
			}
			ss<<content[i];
			start=-1;
			cur=root->next;
		}
	}
	
	if(start>-1&&!shouldReview){
		for(int j=start;j<strlen(content);j++){
			ss<<content[j];
		}
	}
	cout<<ss.str()<<endl;
}


// Interactive driver: builds the keyword tree, then filters every line read
// from standard input until the line "q!" is entered or the input ends, and
// prints the CPU time each filtering pass took.
int main(){
	Node* root=BuildTree();

	std::string line;
	// Testing the stream state ends the loop on end of input or a read error.
	// Reading into a std::string also removes the 127-character limit that
	// used to put the stream into a failed state on long lines.
	while(std::getline(std::cin,line)){
		if(line=="q!"){
			break;
		}

		const std::clock_t start=std::clock();
		LookUp(&line[0],root);
		const std::clock_t end=std::clock();

		// clock() counts ticks, CLOCKS_PER_SEC of them per second; convert
		// before labelling the value as milliseconds.
		const double elapsed_ms=1000.0*static_cast<double>(end-start)/CLOCKS_PER_SEC;

		// The label reads "filtering time:". The previous literal was the same
		// text with its GBK bytes mis-decoded as Latin-1.
		std::cout<<"过滤耗时："<<elapsed_ms<<" ms"<<std::endl<<std::endl;
	}

	ReleaseTree(root);

	return 0;
}

敏感词库：

测试结果：

lld951027

关注

6
点赞
踩
25

收藏

觉得还不错? 一键收藏
8
评论
C++ 敏感词屏蔽

以前做ASP.NET的时候接触过敏感词屏蔽，在c#那种强大的框架下，直接用切词工具再加一个敏感词字典就搞定一切。回到c++，我肯定不会因为搞个敏感词就去引入个框架，第一追求效率，第二看别人写的c++代码也挺痛苦的。接下来，就讲解下对于敏感词屏蔽的具体思路与代码。首先要解决的问题是敏感词的存储形式，这就涉及数据结构，先想想搜索屏蔽要怎么处理，比如我有一个content，我就遍历它每个字符
复制链接

扫一扫