DirtyWordsFilter(脏字过滤)

yunteng521

已于 2023-05-24 09:02:02 修改

阅读量336

点赞数

CC 4.0 BY-SA版权

分类专栏：杂项 c++ go 文章标签：脏字过滤 c++ golang DirtyWords

于 2023-05-23 17:37:14 首次发布

本文链接：https://blog.csdn.net/yunteng521/article/details/130830881

杂项同时被 3 个专栏收录

17 篇文章

订阅专栏

c++

6 篇文章

订阅专栏

4 篇文章

订阅专栏

文章介绍了如何使用C++和Golang实现基于256tree的数据结构来过滤脏字。C++版本包含`dirtywords.h`和`dirtywords.cpp`，而Golang版本是`dirtyword.go`。代码实现了加载脏字列表，检测和过滤脏字的功能。示例代码展示了如何读取`dirtywords.txt`文件并处理输入字符串。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

前言
本文讲述如何使用 256 叉树（按字节索引的字典树，每个节点最多有 256 个子节点）来检测和过滤脏字。
提供 C++ 和 Golang 两个版本的实现。
代码是很早以前写在 https://github.com/progtesttes 上的，
这里稍微做了一些优化。

1：c++ code
dirtywords.h

#if !defined DIRTY_WORDS_H_ 
#define DIRTY_WORDS_H_ 
//#include<stdio.h>
#include<string.h>
// Singleton dirty-word filter backed by a byte-indexed 256-way tree (trie).
//
// Every node has one child slot per possible byte value, so words are matched
// byte by byte. Typical use:
//   CFilterDirtyWords::GetInstance()->LoadDirtyFile();
//   CFilterDirtyWords::GetInstance()->HasDirtyWords("text");
//   CFilterDirtyWords::GetInstance()->ReleaseByOwner();
class CFilterDirtyWords
{
private:
	// Construction and destruction are private: instances are obtained through
	// GetInstance() and destroyed through ReleaseByOwner().
	CFilterDirtyWords();
	~CFilterDirtyWords();
	// The object owns a raw tree; a copy would free the same nodes twice.
	CFilterDirtyWords(const CFilterDirtyWords&) = delete;
	CFilterDirtyWords& operator=(const CFilterDirtyWords&) = delete;
public:
	// Returns the process-wide instance, creating it on first use.
	static CFilterDirtyWords* GetInstance();
	// Destroys this object. The singleton pointer is cleared first so that a
	// later GetInstance() builds a fresh object instead of returning a
	// dangling pointer to the deleted one.
	void	ReleaseByOwner() {
		if (pFilterDirtyWords == this) pFilterDirtyWords = nullptr;
		delete this;
	}
private:
	typedef struct _dirtytree
	{
		// True when the path from the root to this node spells a complete word.
		bool bend;
		// Child nodes indexed by the next byte of the word; null when absent.
		struct _dirtytree * subtree[256];
		_dirtytree() {
			bend = false;
			memset(subtree, 0, sizeof(_dirtytree*) * 256);
		}
	}DIRTYTREE, *PDIRTYTREE;

	// Root of the currently loaded tree; null until a word list is loaded.
	PDIRTYTREE m_phead;
	// The single shared instance handed out by GetInstance().
	static CFilterDirtyWords* pFilterDirtyWords;
private:
	// Builds the tree from a text file holding one word per line.
	bool loaddirtywords(const char* filepath);
	// Reports whether pstring contains any word stored under pHead.
	bool hasdirtywords(const PDIRTYTREE pHead, const char *  pstring);
	// Overwrites, in place, the words found in pstring with '*'.
	void filterdirtywords(const PDIRTYTREE pHead, char * pstring);
	// Adds one word to the tree, allocating the root on demand.
	void insertdirtywords(PDIRTYTREE& pHead, const char *  pstring);
	// Frees the node pHead together with all of its descendants.
	void releasedirtytree(PDIRTYTREE pHead);
public:
	// Loads the word list; a null filepath selects the default file name
	// configured in the implementation file.
	bool LoadDirtyFile(const char* filepath=nullptr);
	// Returns true when lpstr contains at least one loaded word.
	bool HasDirtyWords(const char* lpstr);
	// Masks the loaded words inside pstring; the buffer is modified in place.
	void FilterDirtyWords(char * pstring);
};
#endif

dirtywords.cpp

#include <stdio.h>
#include<ctype.h>
#include "dirtywords.h"
#define  CONFIG_DIRTY_WORDS   "dirtywords.txt"

// Storage for the singleton pointer; it stays NULL until GetInstance() creates the object.
CFilterDirtyWords* CFilterDirtyWords::pFilterDirtyWords = NULL;

// Creates an empty filter: there is no tree until a word list has been loaded.
CFilterDirtyWords::CFilterDirtyWords()
	: m_phead(NULL)
{
}

// Frees every node of the loaded tree; releasedirtytree() ignores a null root,
// so destroying a filter that never loaded anything is fine.
CFilterDirtyWords::~CFilterDirtyWords()
{
	releasedirtytree(m_phead);
}

// Returns the shared filter object, allocating it the first time it is asked for.
// No locking is performed here.
CFilterDirtyWords* CFilterDirtyWords::GetInstance()
{
	if (NULL != pFilterDirtyWords) {
		return pFilterDirtyWords;
	}
	pFilterDirtyWords = new CFilterDirtyWords();
	return pFilterDirtyWords;
}


// Public entry point for loading the word list.
// filepath: file to read; passed straight through to loaddirtywords().
// Returns whatever loaddirtywords() reports.
bool CFilterDirtyWords::LoadDirtyFile(const char* filepath)
{
	return loaddirtywords(filepath);
}

bool CFilterDirtyWords::loaddirtywords(const char* filepath)
{
	FILE * f = fopen(filepath== nullptr? CONFIG_DIRTY_WORDS : filepath, "r");
	if (NULL == f) {
		return false;
	}
	char szbuf[256];
	PDIRTYTREE phead = NULL;
	while (NULL != fgets(szbuf, 256, f)) {
		insertdirtywords(phead, szbuf);
	}
	fclose(f);
	m_phead = phead;
	if (NULL == m_phead) {
		printf("CFilterDirtyWords::loaddirtywords is NULL"); return false;
	}
	return true;
	//	return m_phead?true:false ;
}
// Replaces every occurrence of a stored word inside pstring with '*' bytes.
//
// pHead:   root of the word tree; nothing happens when it is null.
// pstring: NUL-terminated buffer, modified in place; null is ignored.
//
// A match is attempted from every byte offset, so a word is still found when
// an earlier, failed partial match overlaps its beginning (word "aab" in text
// "aaab"). Overlapping words are all masked. ASCII letters are compared
// case-insensitively; every other byte is compared verbatim.
void CFilterDirtyWords::filterdirtywords(const PDIRTYTREE pHead, char * pstring)
{
	if (!pHead || !pstring) return;

	// One past the last byte that belongs to a match found so far.
	const char * pMaskEnd = pstring;
	for (char * pStart = pstring; *pStart != '\0'; ++pStart)
	{
		// Walk the tree over the bytes at and after pStart. Those bytes have
		// not been overwritten yet, so matching always sees the original text.
		PDIRTYTREE pTree = pHead;
		for (const char * pCur = pStart; *pCur != '\0'; ++pCur)
		{
			// Go through unsigned char: a plain char may be negative, which is
			// neither a valid array index nor a valid <ctype.h> argument.
			unsigned char ch = static_cast<unsigned char>(*pCur);
			if (ch >= 'A' && ch <= 'Z') ch = static_cast<unsigned char>(ch - 'A' + 'a');

			pTree = pTree->subtree[ch];
			if (!pTree) break;
			// Keep walking after a hit so that a longer word sharing this
			// prefix extends the masked range.
			if (pTree->bend && pCur + 1 > pMaskEnd) pMaskEnd = pCur + 1;
		}
		if (pStart < pMaskEnd) *pStart = '*';
	}
}
// Adds one word to the tree.
//
// pHead:   root of the tree; allocated here when it is still null.
// pstring: the word; reading stops at the first NUL, '\r' or '\n', so a line
//          taken straight from fgets() can be passed in. Null is ignored.
//
// ASCII upper-case letters are stored as lower case; all other bytes are
// stored unchanged.
void CFilterDirtyWords::insertdirtywords(PDIRTYTREE& pHead, const char *  pstring)
{
	if (!pstring) return;
	if (!pHead) pHead = new DIRTYTREE;

	PDIRTYTREE pTree = pHead;
	for (const char * pTemp = pstring;
		*pTemp != '\0' && *pTemp != '\r' && *pTemp != '\n';
		++pTemp)
	{
		// Go through unsigned char: a plain char may be negative, which is
		// neither a valid array index nor a valid <ctype.h> argument.
		unsigned char ch = static_cast<unsigned char>(*pTemp);
		if (ch >= 'A' && ch <= 'Z') ch = static_cast<unsigned char>(ch - 'A' + 'a');

		if (!pTree->subtree[ch]) pTree->subtree[ch] = new DIRTYTREE;
		pTree = pTree->subtree[ch];
	}
	// An empty word (blank line) consumed nothing; do not flag the root itself
	// as the end of a word.
	if (pTree != pHead) pTree->bend = true;
}
// Recursively frees the subtree rooted at pHead: children first, then the node.
// A null pointer is accepted and ignored.
void CFilterDirtyWords::releasedirtytree(PDIRTYTREE pHead)
{
	if (NULL == pHead) {
		return;
	}
	unsigned int slot = 0;
	while (slot < 256) {
		releasedirtytree(pHead->subtree[slot]);
		++slot;
	}
	delete pHead;
}

// Reports whether pstring contains at least one word stored in the tree.
//
// pHead:   root of the word tree; a null root never matches.
// pstring: NUL-terminated text to inspect; null yields false.
//
// A match is attempted from every byte offset. Resetting to the root on a
// mismatch without retrying skipped real matches, e.g. word "123" was not
// found in "1123". ASCII letters are compared case-insensitively; every other
// byte is compared verbatim.
bool CFilterDirtyWords::hasdirtywords(const PDIRTYTREE pHead, const char *  pstring)
{
	if (!pHead || !pstring) return false;

	for (const char * pStart = pstring; *pStart != '\0'; ++pStart)
	{
		PDIRTYTREE pTree = pHead;
		for (const char * pCur = pStart; *pCur != '\0'; ++pCur)
		{
			// Go through unsigned char: a plain char may be negative, which is
			// neither a valid array index nor a valid <ctype.h> argument.
			unsigned char ch = static_cast<unsigned char>(*pCur);
			if (ch >= 'A' && ch <= 'Z') ch = static_cast<unsigned char>(ch - 'A' + 'a');

			pTree = pTree->subtree[ch];
			if (!pTree) break;              // no stored word continues this way
			if (pTree->bend) return true;   // a complete word ends on this byte
		}
	}
	return false;
}

// Public check: runs hasdirtywords() on pstring against the currently loaded tree (m_phead).
bool CFilterDirtyWords::HasDirtyWords(const char *  pstring)
{
	return hasdirtywords(m_phead, pstring);
}

// Public filter: runs filterdirtywords() on pstring against the currently loaded
// tree (m_phead). The caller's buffer is modified in place.
void CFilterDirtyWords::FilterDirtyWords(char * pstring)
{
	filterdirtywords(m_phead, pstring);
}

main.cpp

#include "dirtywords.h"
#include <stdio.h>
int main() {

	if (CFilterDirtyWords::GetInstance()->LoadDirtyFile()) {
		printf("%d \n", CFilterDirtyWords::GetInstance()->HasDirtyWords("123"));  //1
		printf("%d \n", CFilterDirtyWords::GetInstance()->HasDirtyWords("12"));   //0
	}
	CFilterDirtyWords::GetInstance()->ReleaseByOwner() ;
	return  0;
}


/*
dirtywords.txt 内容如下
132
123
121
1221
1121
*/

运行结果
（原文此处为运行结果截图；按上面的 dirtywords.txt，程序依次输出 1 和 0。）

2：golang code
dirtyword.go

package dityword

import (
	"bufio"
	"io"
	"log"
	"os"
	"strings"
)

// dirtytree is one node of a 256-way tree (a byte-indexed trie).
type dirtytree struct {
	// bend is set by insertdirtywords on the node where an inserted word ends.
	bend    bool
	// subtree holds the child for each possible next byte; nil when absent.
	subtree [256]*dirtytree
}

var (
	// dirtyhead is the root of the loaded word tree; it stays nil until
	// loaddirtywords succeeds.
	dirtyhead *dirtytree = nil
)

// loaddirtywords reads filename (one word per line) into a new tree and
// publishes it through dirtyhead.
//
// It returns false when the file cannot be opened or a read error occurs; in
// both cases dirtyhead is left untouched. Empty lines are skipped and each
// word is capped at 256 bytes.
func loaddirtywords(filename string) bool {
	fi, err := os.Open(filename)
	if err != nil {
		log.Printf("filename=%v Error: %s\n", filename, err)
		return false
	}
	defer fi.Close()

	phead := new(dirtytree)

	br := bufio.NewReader(fi)
	for {
		line, isPrefix, rerr := br.ReadLine()
		if rerr == io.EOF {
			break
		}

		word := line
		if len(word) > 256 {
			word = word[:256]
		}
		// ReadLine's slice is only valid until the next read, and more reads
		// may follow below, so keep a private copy.
		word = append([]byte(nil), word...)

		// A line longer than the reader's buffer arrives in pieces; discard
		// the remainder so it is not inserted as a separate word.
		for isPrefix && rerr == nil {
			_, isPrefix, rerr = br.ReadLine()
		}
		// Any error other than EOF would otherwise repeat on every iteration
		// and spin this loop forever.
		if rerr != nil && rerr != io.EOF {
			log.Printf("filename=%v Error: %s\n", filename, rerr)
			return false
		}

		if len(word) < 1 {
			continue
		}
		insertdirtywords(phead, word)
	}
	dirtyhead = phead
	return true
}

// hasdirtywords reports whether str contains at least one word stored in the
// tree rooted at phead. A nil root or an empty string yields false.
//
// A match is attempted from every byte offset. Resetting to the root on a
// mismatch without retrying skipped real matches, e.g. word "123" was not
// found in "1123".
func hasdirtywords(phead *dirtytree, str string) bool {
	if phead == nil {
		return false
	}
	// Words are stored lower-cased, so compare in lower case as well.
	// NOTE(review): strings.ToLower works on UTF-8 text; confirm what it does
	// to the GBK-encoded bytes that main.go passes in.
	text := strings.ToLower(str)

	for start := 0; start < len(text); start++ {
		node := phead
		for i := start; i < len(text); i++ {
			node = node.subtree[text[i]]
			if node == nil {
				break // no stored word continues this way
			}
			if node.bend {
				return true // a complete word ends on this byte
			}
		}
	}
	return false
}

//func filterdirtywords(phead *dirtytree,str string)  {
//
//}

// insertdirtywords adds the word str, lower-cased, to the tree rooted at phead.
//
// A nil root or an empty word is ignored: a root allocated here could never be
// seen by the caller, because the pointer is passed by value.
func insertdirtywords(phead *dirtytree, str []byte) {
	if phead == nil || len(str) < 1 {
		return
	}
	// Store everything in lower case.
	word := strings.ToLower(string(str))

	node := phead
	for i := 0; i < len(word); i++ {
		ch := word[i]
		if node.subtree[ch] == nil {
			node.subtree[ch] = new(dirtytree)
		}
		// Descend whether or not the child already existed. Descending only
		// after creating a node left the walk stuck on shared prefixes and
		// attached the rest of the word to the wrong parent.
		node = node.subtree[ch]
	}
	node.bend = true
}

//func releasedirtytree(phead *dirtytree)  {

//}

//api////////////////////////////////////////////////
// LoadDirtyWordsFile is the exported wrapper around loaddirtywords; it returns
// that function's result unchanged.
func LoadDirtyWordsFile(filename string) bool {
	return loaddirtywords(filename)
}

// HasDirtyWords is the exported wrapper around hasdirtywords; it checks chstr
// against the package-level tree dirtyhead.
func HasDirtyWords(chstr string) bool {

	return hasdirtywords(dirtyhead, chstr)
}

//func FilterDirtyWords(filterstr string)  {
//
//}

main.go

package main

import (
	"bytes"
	"dirtywords/dityword"
	"fmt"
	"github.com/henrylee2cn/mahonia"
	"log"
	"os"
	"path"
	"regexp"
	"unicode/utf8"
)

// check reports whether src looks like it carries SQL-injection material: a
// single quote, a "--" comment marker, a /* ... */ comment, or one of the
// listed SQL keywords standing as a whole word.
//
// If the pattern fails to compile, the error is printed and true is returned.
func check(src string) bool {
	// A raw string literal is required here. In an interpreted ("...") string
	// \b is the backspace character, not the regexp word-boundary assertion,
	// so the keyword alternative could never match ordinary text.
	// "trancate" in the keyword list was a typo for "truncate".
	const pattern = `(?:')|(?:--)|(/\*(?:.|[\n\r])*?\*/)|(\b(select|update|and|or|delete|insert|truncate|char|chr|into|substr|ascii|declare|exec|count|master|into|drop|execute)\b)`
	re, err := regexp.Compile(pattern)
	if err != nil {
		fmt.Println(err.Error())
		return true
	}
	b := re.MatchString(src)
	fmt.Println("lllll", b) // debug trace of the match result
	return b
}

// main loads ditylist.txt from the current working directory, then loops
// forever: it reads one token from standard input, converts it to GBK and
// logs whether the converted bytes contain a listed word.
func main() {

	// Step 1: load the word list; exit with status 1 if that fails.
	workdir, _ := os.Getwd()
	listfile := path.Join(workdir, "ditylist.txt")
	if ok := dityword.LoadDirtyWordsFile(listfile); !ok {
		os.Exit(1)
	}

	for {
		var input string
		fmt.Scanln(&input)
		log.Printf("input=%v len=%v \n", input, len(input))

		// Only valid UTF-8 input is converted and checked.
		if !utf8.ValidString(input) {
			continue
		}

		gbkstr := mahonia.NewEncoder("gbk").ConvertString(input)
		log.Printf("gbkstr=%v \n", []byte(gbkstr))
		b := dityword.HasDirtyWords(gbkstr)
		usrc := bytes.Runes([]byte(input))
		log.Printf("check b=%v uscr=%#v %v\n", b, usrc, len(usrc))

		// Sample log output:
		//	2018/05/26 00:02:12 input=日 len=3
		//	2018/05/26 00:02:12 gbkstr=[200 213]
		//	2018/05/26 00:02:12 check b=true uscr=[]int32{26085} 1

		// Earlier experiments, kept for reference:
		//r, size := utf8.DecodeRuneInString(input)
		//fmt.Printf("%c %v\n", r, size)

		//	newdata := string(([]byte(input))[size:])
		//	fmt.Printf("%c %v  data=%v \n", r, size,newdata)
		//str = str[size:]
		//	if data,num := utf8.DecodeRuneInString(input); ok {
		//		b := dityword.HasDirtyWords(input)
		//		fmt.Printf("check b=%v \n",b)
		//	}
	}

}

/*
ditylist.txt 内容如下
fyou
fky
fyou1
*/