Go 基于 DFA 算法的敏感词过滤包

package sensitive

import (
	"bufio"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"sync"
)

// MATCHTYPE selects how many matches SearchSensitive collects.
type MATCHTYPE int

const (
	// ALL makes SearchSensitive keep scanning and report every match.
	ALL MATCHTYPE = iota
	// SINGLE makes SearchSensitive return as soon as the first match is found.
	SINGLE

	// INVALID_WORDS is a comma-separated list of filler characters. Setup
	// splits it on "," and registers each piece in InvalidWords; Contains and
	// SearchSensitive skip those characters while scanning a sentence.
	// NOTE(review): because "," is the separator, the ",,," run produces empty
	// pieces rather than a comma entry — confirm whether a comma was intended.
	INVALID_WORDS           = " ,~,!,@,#,$,%,^,&,*,(,),_,-,+,=,?,<,>,.,—,,,。,/,\\,|,《,》,?,;,:,:,',‘,;,“,!,。,;,:,’,{,},【,】,[,],、"
	// SENSITIVE_CHILDRED_SIZE is a capacity hint intended for a trie node's
	// children map.
	SENSITIVE_CHILDRED_SIZE = 128

	// LEXICON_PATH is the directory Setup reads word-list files from.
	// TODO: adjust this lexicon directory path to match the project layout.
	LEXICON_PATH = "./pkg/sensitive/lexicon"
)

// InvalidWords is the set of filler characters that the matcher skips while
// scanning a sentence. Keys are one-character strings; the values are unused.
// It is populated by Setup.
var InvalidWords = make(map[string]interface{})

// SensitiveWords holds the raw lexicon lines appended by Setup. It starts
// empty with room for 20000 entries; a non-zero initial length would put
// 20000 empty strings in front of the real words.
var SensitiveWords = make([]string, 0, 20000)
// Util is the package-level matcher. It is nil until Setup assigns it.
var Util *DFAUtil

// Setup initialises the package-level state: it fills InvalidWords from
// INVALID_WORDS, appends every line of every regular file in LEXICON_PATH to
// SensitiveWords, builds a trie from the words longer than one rune, and
// stores the resulting matcher in Util.
//
// Setup panics when the lexicon directory cannot be read, when it contains no
// word files, or when a word file cannot be opened or scanned.
func Setup() {
	// NOTE(review): "," is the separator, so the ",,," run in INVALID_WORDS
	// yields empty pieces and never registers a comma.
	for _, v := range strings.Split(INVALID_WORDS, ",") {
		InvalidWords[v] = nil
	}

	dir, err := ioutil.ReadDir(LEXICON_PATH)
	if err != nil {
		panic(err)
	}

	var fileList []string
	for _, fi := range dir {
		// Sub-directories are not word lists; scanning one would only fail.
		if fi.IsDir() {
			continue
		}
		fileList = append(fileList, filepath.Join(LEXICON_PATH, fi.Name()))
	}

	if len(fileList) == 0 {
		panic("请添加敏感词文件")
	}

	for _, fileName := range fileList {
		words, err := readLexiconFile(fileName)
		if err != nil {
			panic(err)
		}
		SensitiveWords = append(SensitiveWords, words...)
	}

	dfaUtil := &DFAUtil{
		root: newSensitiveNode(),
	}

	for _, word := range SensitiveWords {
		sensitiveRune := []rune(word)

		// Single-rune (and empty) entries are not added to the trie.
		if len(sensitiveRune) > 1 {
			dfaUtil.AddWord(sensitiveRune)
		}
	}

	Util = dfaUtil
}

// readLexiconFile returns the lines of the file at path. The file is closed
// before the function returns, so a caller looping over many files never
// holds more than one open at a time.
func readLexiconFile(path string) ([]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var words []string
	s := bufio.NewScanner(f)
	for s.Scan() {
		words = append(words, s.Text())
	}
	if err := s.Err(); err != nil {
		return nil, err
	}
	return words, nil
}

// sensitiveNode is one node of the rune trie that stores the lexicon.
type sensitiveNode struct {
	// isEnd reports whether the path from the root to this node spells a
	// complete word.
	isEnd bool

	// children maps the next rune to the node reached by consuming it.
	children map[rune]*sensitiveNode
}

// newSensitiveNode returns an empty, non-terminal node with a ready-to-use
// children map.
//
// The map is created without a size hint: most nodes in a word trie have only
// a handful of children, and pre-sizing every node for 128 entries multiplies
// the trie's memory footprint for no benefit. The map still grows on demand.
func newSensitiveNode() *sensitiveNode {
	return &sensitiveNode{
		children: make(map[rune]*sensitiveNode),
	}
}

// DFAUtil matches sentences against a lexicon stored as a trie of runes.
type DFAUtil struct {
	// root is the entry node of the trie; every scan starts here.
	root *sensitiveNode

	// mu is held by AddWord while it inserts a word. Contains and
	// SearchSensitive read the trie without taking it.
	mu sync.Mutex
}

// NewDFAUtil returns the package-level matcher Util. It does not build a new
// matcher; the result is nil until Setup has assigned Util.
func NewDFAUtil() *DFAUtil {
	return Util
}

// AddWord inserts word into the trie, creating any missing nodes along its
// path and flagging the last node reached as the end of a word. The call is a
// no-op when the matcher has no root. Insertion runs under the matcher's
// mutex.
func (dfaUtil *DFAUtil) AddWord(word []rune) {
	if dfaUtil.root == nil {
		return
	}

	dfaUtil.mu.Lock()
	defer dfaUtil.mu.Unlock()

	node := dfaUtil.root
	for _, r := range word {
		next, ok := node.children[r]
		if !ok {
			// First time this rune follows the current prefix: grow the trie.
			next = newSensitiveNode()
			node.children[r] = next
		}
		node = next
	}

	node.isEnd = true
}

// Contains reports whether sentence contains at least one lexicon word.
//
// Characters registered in InvalidWords are skipped while a word is being
// matched, so filler placed between the runes of a word does not hide it.
// A hit requires at least two matched runes; a one-rune lexicon entry never
// counts on its own. Contains returns false when the matcher has no root.
//
// Every position of the sentence is tried as a possible word start, so a word
// that overlaps a failed partial match (lexicon "ab", sentence "aab") is
// still found.
func (dfaUtil *DFAUtil) Contains(sentence string) bool {
	if dfaUtil.root == nil {
		return false
	}

	sentenceRune := []rune(sentence)
	length := len(sentenceRune)

	for start := 0; start < length; start++ {
		currNode := dfaUtil.root
		matched := 0

		for i := start; i < length; i++ {
			if _, exist := InvalidWords[string(sentenceRune[i])]; exist {
				if matched == 0 {
					// A word cannot begin on a filler character; the next
					// start position will be tried instead.
					break
				}
				continue
			}

			targetNode, exist := currNode.children[sentenceRune[i]]
			if !exist {
				break
			}

			currNode = targetNode
			matched++
			if currNode.isEnd && matched >= 2 {
				return true
			}
		}
	}

	return false
}

// SearchSensitive scans sentence for lexicon words and returns the rune-index
// span of each match, in order of appearance.
//
// A span runs from the first matched rune to the last matched rune, inclusive,
// and therefore also covers any InvalidWords filler characters skipped in
// between. Matching is shortest-first: a span ends at the first terminal node
// reached, and scanning resumes right after it, so spans never overlap.
//
// With matchType SINGLE the function returns as soon as the first match is
// found; with ALL it collects every match. The result is nil when nothing
// matches or when the matcher has no root.
//
// Every position is tried as a possible word start, including positions
// inside a partial match that ran into the end of the sentence, so lexicon
// {"abcd", "bc"} still finds "bc" in "abc".
func (dfaUtil *DFAUtil) SearchSensitive(sentence string, matchType MATCHTYPE) (matchIndexList []*matchIndex) {
	if dfaUtil.root == nil {
		return matchIndexList
	}

	sentenceRune := []rune(sentence)
	length := len(sentenceRune)

	for start := 0; start < length; {
		end := -1
		currNode := dfaUtil.root

		for i := start; i < length; i++ {
			if _, exist := InvalidWords[string(sentenceRune[i])]; exist {
				if i == start {
					// A word cannot begin on a filler character.
					break
				}
				continue
			}

			targetNode, exist := currNode.children[sentenceRune[i]]
			if !exist {
				break
			}

			currNode = targetNode
			if currNode.isEnd {
				end = i
				break
			}
		}

		if end == -1 {
			// No word starts here; try the next position.
			start++
			continue
		}

		matchIndexList = append(matchIndexList, newMatchIndex(start, end))
		if matchType == SINGLE {
			return matchIndexList
		}

		// Go back to the top of the trie and look for the next word after
		// this match.
		start = end + 1
	}

	return matchIndexList
}

// Cover replaces every rune inside each span reported by
// SearchSensitive(sentence, ALL) with mask. It returns the masked sentence and
// true when at least one span was found; otherwise it returns the sentence
// unchanged and false.
func (dfaUtil *DFAUtil) Cover(sentence string, mask rune) (string, bool) {
	hits := dfaUtil.SearchSensitive(sentence, ALL)
	if len(hits) == 0 {
		return sentence, false
	}

	masked := []rune(sentence)
	for _, hit := range hits {
		// Spans are inclusive on both ends.
		pos := hit.start
		for pos <= hit.end {
			masked[pos] = mask
			pos++
		}
	}

	return string(masked), true
}

// matchIndex is an inclusive span of rune indices within a sentence.
type matchIndex struct {
	start int // index of the first rune of the span
	end   int // index of the last rune of the span
}

// newMatchIndex returns a pointer to a matchIndex covering start through end.
func newMatchIndex(start, end int) *matchIndex {
	span := new(matchIndex)
	span.start, span.end = start, end
	return span
}

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值