java 根据中文包含词和中文排除词,匹配文章中命中的词组规则

文章:"小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";

+代表与,|代表或,整个规则必须用()括起来

当排除词的规则命中其中一个时,整篇文章视为不匹配,返回false

包含词规则:"(小学生|(思绪|想象力+适应期))"

排除词规则:"(小x生)"

源码:WordMatcher.java

类文件 BoolParser.java 在我以前发布的文章《java 求字符串形式bool表达式的值》中:

import org.apache.commons.lang3.StringUtils;

import java.util.*;

import java.util.function.Function;


/**
 * Matches an article against an "include" keyword rule and an optional "exclude" keyword rule.
 *
 * <p>Rule syntax: {@code +} means AND, {@code |} means OR, parentheses group sub-expressions and
 * spaces are ignored. Every other character is part of a keyword. Each distinct keyword is
 * replaced by a single placeholder character (an "id"); when an article is matched the ids are
 * substituted with {@code T}/{@code F} and the resulting boolean expression is evaluated by
 * {@code BoolParser.parse}.
 *
 * <p>A rule is split into several sub-expressions wherever a keyword is immediately followed by
 * {@code +(}. All include sub-expressions must evaluate to true for the article to match; if any
 * exclude sub-expression evaluates to true the article is rejected.
 *
 * <p>{@link #match(String)} stores its hit set in an unsynchronized instance field, so one
 * instance should not be used from several threads at once without external synchronization.
 */
public class WordMatcher {
    /** First character code used as a keyword placeholder inside compiled expressions. */
    private static final int ID_BASE = 256;

    private final List<String> includeExprList;
    private final List<String> excludeExprList;

    private final Map<String, Character> wordMapId = new HashMap<>();
    private final Map<Character, String> idMapWord = new HashMap<>();
    private final Tree tree = new Tree();
    private Set<String> hitExpr = null;
    private final int includeWordCount;

    /**
     * Compiles the include and exclude rules.
     *
     * @param includeExpr include rule; must not be blank
     * @param excludeExpr exclude rule; blank or {@code null} means "no exclusions"
     * @throws IllegalArgumentException if the include rule is blank or compiles to nothing, if
     *         parentheses are unbalanced, or if a rule ends with a dangling {@code +}
     */
    public WordMatcher(String includeExpr, String excludeExpr) {
        if (StringUtils.isBlank(includeExpr)) {
            throw new IllegalArgumentException("包含词不能为空");
        }
        this.includeExprList = parse(includeExpr);
        if (includeExprList.isEmpty()) {
            throw new IllegalArgumentException("包含词表达式不能为空");
        }
        // Snapshot taken before the exclude rule is parsed: words first seen in the exclude
        // rule are counted by excludeWordCount().
        this.includeWordCount = idMapWord.size();
        if (StringUtils.isBlank(excludeExpr)) {
            this.excludeExprList = new ArrayList<>();
        } else {
            this.excludeExprList = parse(excludeExpr);
        }
    }

    /** Returns the number of distinct keywords registered from both rules. */
    public int wordCount() {
        return idMapWord.size();
    }

    /** Returns the number of distinct keywords registered from the include rule. */
    public int includeWordCount() {
        return includeWordCount;
    }

    /** Returns the number of distinct keywords that appear only in the exclude rule. */
    public int excludeWordCount() {
        return wordCount() - includeWordCount;
    }

    /**
     * Tests whether the article satisfies the include rule without triggering the exclude rule.
     *
     * <p>After the call, {@link #hitExpr()} returns the include sub-expressions that evaluated to
     * true before evaluation stopped; it is empty when an exclude sub-expression was hit.
     *
     * @param content article text; must be non-null and non-empty
     * @return {@code true} if no exclude sub-expression is true and every include
     *         sub-expression is true
     * @throws NullPointerException if {@code content} is null
     * @throws RuntimeException if {@code content} is empty
     */
    public boolean match(String content) {
        Objects.requireNonNull(content);
        if (content.isEmpty()) {
            throw new RuntimeException("empty string");
        }
        Set<String> found = findWords(content);

        for (String exclude : excludeExprList) {
            if (BoolParser.parse(substitute(exclude, found))) {
                hitExpr = new HashSet<>();
                return false;
            }
        }

        Set<String> hits = new HashSet<>();
        boolean result = false;
        for (String include : includeExprList) {
            result = BoolParser.parse(substitute(include, found));
            if (!result) {
                // Sub-expressions are AND-ed together, so the first failure decides the outcome.
                break;
            }
            hits.add(restore(include));
        }
        hitExpr = hits;
        return result;
    }

    /**
     * Returns the include sub-expressions recorded by the most recent {@link #match(String)}.
     *
     * @throws IllegalStateException if {@code match} has not been called yet
     */
    public Set<String> hitExpr() {
        if (hitExpr == null) {
            throw new IllegalStateException("请先匹配文章");
        }
        return hitExpr;
    }

    /**
     * Compiles one rule into a list of sub-expressions in which every keyword is replaced by its
     * placeholder id and {@code +} is rewritten to {@code &}.
     */
    private List<String> parse(String expr) {
        List<String> parts = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        StringBuilder word = new StringBuilder();
        int level = 0;
        char[] chars = expr.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            char ch = chars[i];
            boolean splitHere = false;
            switch (ch) {
                case ' ':
                    continue;
                case '(':
                    level++;
                    current.append(ch);
                    continue;
                case ')':
                    level--;
                    if (level < 0) {
                        // A ')' that closes nothing can never be balanced later.
                        throw new IllegalArgumentException("表达式语法错误:" + expr);
                    }
                    break;
                case '|':
                    break;
                case '+':
                    if (i + 1 >= chars.length) {
                        // Dangling AND operator; previously an ArrayIndexOutOfBoundsException.
                        throw new IllegalArgumentException("表达式语法错误:" + expr);
                    }
                    ch = '&';
                    splitHere = chars[i + 1] == '(';
                    break;
                default:
                    word.append(ch);
                    continue;
            }

            if (word.length() == 0) {
                current.append(ch);
                continue;
            }
            current.append(idFor(word.toString()));
            word.setLength(0);

            if (splitHere) {
                // "keyword+(" starts a new sub-expression; the '&' itself is dropped because
                // the sub-expressions are AND-ed by match().
                parts.add(current.toString());
                current.setLength(0);
                continue;
            }
            current.append(ch);
        }

        if (level != 0) {
            throw new IllegalArgumentException("表达式语法错误:" + expr);
        }
        if (word.length() != 0) {
            current.append(idFor(word.toString()));
        }
        if (current.length() != 0) {
            parts.add(current.toString());
        }
        return parts;
    }

    /** Registers the keyword in the trie and returns its placeholder id, reusing an existing one. */
    private char idFor(String word) {
        tree.insert(word);
        Character id = wordMapId.get(word);
        if (id == null) {
            id = (char) (wordMapId.size() + ID_BASE);
            wordMapId.put(word, id);
            idMapWord.put(id, word);
        }
        return id;
    }

    /** Collects every registered keyword that occurs anywhere in the text. */
    private Set<String> findWords(String content) {
        Set<String> found = new HashSet<>();
        char[] text = content.toCharArray();
        for (int start = 0; start < text.length; start++) {
            tree.collectWordsAt(text, start, found);
        }
        return found;
    }

    /** Replaces each placeholder id with {@code T} or {@code F} depending on whether its keyword was found. */
    private String substitute(String expr, Set<String> found) {
        StringBuilder out = new StringBuilder(expr.length());
        for (char ch : expr.toCharArray()) {
            String word = idMapWord.get(ch);
            if (word == null) {
                out.append(ch);
            } else {
                out.append(found.contains(word) ? 'T' : 'F');
            }
        }
        return out.toString();
    }

    /** Turns a compiled sub-expression back into rule syntax (ids to keywords, {@code &} to {@code +}). */
    private String restore(String expr) {
        StringBuilder out = new StringBuilder();
        for (char ch : expr.toCharArray()) {
            String word = idMapWord.get(ch);
            if (word != null) {
                out.append(word);
            } else if (ch == '&') {
                out.append('+');
            } else {
                out.append(ch);
            }
        }
        return out.toString();
    }

    /** Character trie holding all registered keywords. */
    private static class Tree {
        private final Map<Character, Node> nodes = new HashMap<>();

        /** Adds a keyword; empty strings are ignored. */
        public void insert(String word) {
            Objects.requireNonNull(word);
            if (word.isEmpty()) {
                return;
            }
            char[] chars = word.toCharArray();
            Node node = nodes.computeIfAbsent(chars[0], Node::new);
            for (int i = 1; i < chars.length; i++) {
                node = node.putChild(chars[i]);
            }
            node.markEnd();
        }

        /** Adds to {@code out} every keyword that starts exactly at {@code text[start]}. */
        public void collectWordsAt(char[] text, int start, Set<String> out) {
            Node node = nodes.get(text[start]);
            for (int end = start + 1; node != null; end++) {
                if (node.isEnd()) {
                    out.add(new String(text, start, end - start));
                }
                if (end >= text.length) {
                    break;
                }
                node = node.getChild(text[end]);
            }
        }
    }

    /** One trie node; {@code end} marks that a keyword terminates here. */
    private static class Node {
        protected final char value;
        private final Map<Character, Node> childNodes = new HashMap<>();
        private boolean end;

        public Node(char value) {
            this.value = value;
        }

        public Node putChild(char value) {
            return childNodes.computeIfAbsent(value, Node::new);
        }

        public Node getChild(char value) {
            return childNodes.get(value);
        }

        public void markEnd() {
            this.end = true;
        }

        /** True when a keyword ends at this node, even if longer keywords continue below it. */
        public boolean isEnd() {
            return end;
        }
    }
}

使用示例1

    // Usage example 1: an include rule combined with an exclude rule.
    public static void main(String[] args) {
        WordMatcher matcher = new WordMatcher("(小学生|(思绪|想象力+适应a))", "(小x生)");
        boolean result = matcher.match("小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因");
        // NOTE(review): the outputs below are the ones reported by the original article. The
        // article text contains 小学生 and does not contain 小x生, so the class as written may
        // well print true here — confirm by running with BoolParser available.
        System.out.println(result);// reported output: false
        System.out.println(matcher.hitExpr());// reported output: []
    }

使用示例2

    // Usage example 2: an include rule only; the blank exclude rule disables exclusion.
    public static void main(String[] args) {
        WordMatcher matcher = new WordMatcher("(小学生|(思绪|想象力+适应期))", "");
        boolean result = matcher.match("小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因");
        // NOTE(review): the outputs below are the ones reported by the original article. The
        // rule contains no "keyword+(" split point, so hitExpr() presumably holds the whole
        // rule as a single entry rather than two — confirm by running.
        System.out.println(result);// reported output: true
        System.out.println(matcher.hitExpr());// reported output: [(思绪|想象力+适应期), 小学生]
    }
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值