文章:"小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";
+代表与,|代表或,整个规则必须用()括起来
当排除词的规则命中其中一个时,整篇文章视为不匹配,返回false
包含词规则:"(小学生|(思绪|想象力+适应期))"
排除词规则:"(小x生)"
源码:WordMatcher.java
类文件 BoolParser.java 在我以前发布的文章《java 求字符串形式bool表达式的值》中:
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.function.Function;
/**
 * Matches a piece of text against boolean keyword rules.
 *
 * <p>Rule syntax: {@code +} means AND, {@code |} means OR, parentheses group
 * sub-expressions, and space characters are ignored. Every other character is part
 * of a keyword. A text matches when every include expression evaluates to true and
 * no exclude expression does.
 *
 * <p>Internally each keyword is replaced by a single placeholder character (its "id"),
 * and at match time each id is replaced by {@code T} or {@code F} before the expression
 * is handed to {@code BoolParser.parse}. {@code BoolParser} lives outside this file;
 * it is presumably able to evaluate expressions built from {@code T}, {@code F},
 * {@code &}, {@code |} and parentheses -- verify against its source.
 *
 * <p>{@link #match(String)} stores its outcome in an instance field read by
 * {@link #hitExpr()}, so one instance should not be shared between threads without
 * external synchronization.
 */
public class WordMatcher {
    /** Character code of the first keyword id; ids stay clear of the operator characters. */
    private static final int ID_BASE = 256;

    private final List<String> includeExprList;
    private final List<String> excludeExprList;
    private final Map<String, Character> wordMapId = new HashMap<>();
    private final Map<Character, String> idMapWord = new HashMap<>();
    private final Tree tree = new Tree();
    private Set<String> hitExpr = null;
    private final int includeWordCount;

    /**
     * Compiles the include and exclude rules.
     *
     * @param includeExpr rule the text must satisfy; must not be blank
     * @param excludeExpr rule that vetoes a match; blank means "no exclusions"
     * @throws IllegalArgumentException if {@code includeExpr} is blank, compiles to
     *     nothing, or either rule has unbalanced parentheses
     */
    public WordMatcher(String includeExpr, String excludeExpr) {
        if (StringUtils.isBlank(includeExpr)) {
            throw new IllegalArgumentException("包含词不能为空");
        }
        this.includeExprList = compile(includeExpr);
        if (includeExprList.isEmpty()) {
            throw new IllegalArgumentException("包含词表达式不能为空");
        }
        // Snapshot taken before exclude keywords are registered.
        this.includeWordCount = idMapWord.size();
        if (StringUtils.isBlank(excludeExpr)) {
            this.excludeExprList = new ArrayList<>();
        } else {
            this.excludeExprList = compile(excludeExpr);
        }
    }

    /**
     * Translates a rule into one or more compiled expressions in which every keyword
     * is replaced by its id and {@code +} by {@code &}. A top-level {@code word+(}
     * boundary splits the rule into separate expressions; {@link #match(String)}
     * requires all of them to hold, so the AND semantics are preserved.
     */
    private List<String> compile(String expr) {
        List<String> exprList = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        StringBuilder word = new StringBuilder();
        int level = 0;
        char[] chars = expr.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            char op = chars[i];
            boolean split = false;
            switch (op) {
                case ' ':
                    continue;
                case '(':
                    level++;
                    current.append(op);
                    continue;
                case ')':
                    level--;
                    if (level < 0) {
                        throw new IllegalArgumentException("表达式语法错误:" + expr);
                    }
                    break;
                case '|':
                    break;
                case '+':
                    op = '&';
                    // Only split outside all parentheses (otherwise both fragments would be
                    // unbalanced), and never look past the end of the input.
                    split = level == 0 && i + 1 < chars.length && chars[i + 1] == '(';
                    break;
                default:
                    word.append(op);
                    continue;
            }
            if (word.length() == 0) {
                current.append(op);
                continue;
            }
            current.append(idFor(word.toString()));
            word.setLength(0);
            if (split) {
                // The '&' is dropped: the two fragments are AND-ed by match().
                exprList.add(current.toString());
                current.setLength(0);
                continue;
            }
            current.append(op);
        }
        if (level != 0) {
            throw new IllegalArgumentException("表达式语法错误:" + expr);
        }
        if (word.length() != 0) {
            current.append(idFor(word.toString()));
        }
        if (current.length() != 0) {
            exprList.add(current.toString());
        }
        return exprList;
    }

    /** Returns the id of {@code word}, registering the word (and indexing it) on first use. */
    private char idFor(String word) {
        Character known = wordMapId.get(word);
        if (known != null) {
            return known;
        }
        tree.insert(word);
        char id = (char) (wordMapId.size() + ID_BASE);
        wordMapId.put(word, id);
        idMapWord.put(id, word);
        return id;
    }

    /** Returns the number of distinct keywords across both rules. */
    public int wordCount() {
        return idMapWord.size();
    }

    /** Returns the number of distinct keywords in the include rule. */
    public int includeWordCount() {
        return includeWordCount;
    }

    /**
     * Returns the number of keywords first introduced by the exclude rule; keywords
     * that also occur in the include rule are not counted again.
     */
    public int excludeWordCount() {
        return wordCount() - includeWordCount;
    }

    /**
     * Tests {@code content} against the rules and records the include expressions that
     * were satisfied (see {@link #hitExpr()}).
     *
     * <p>If any exclude expression holds, the result is {@code false} and the recorded
     * set is empty. Otherwise include expressions are evaluated in order and evaluation
     * stops at the first one that fails; expressions satisfied before that point stay
     * in the recorded set even though the overall result is {@code false}.
     *
     * @param content text to scan; must be non-null and non-empty
     * @return {@code true} if no exclude expression holds and every include expression holds
     * @throws NullPointerException if {@code content} is null
     * @throws IllegalArgumentException if {@code content} is empty
     */
    public boolean match(String content) {
        Objects.requireNonNull(content);
        if (content.isEmpty()) {
            throw new IllegalArgumentException("empty string");
        }
        Set<String> found = tree.wordsIn(content);
        for (String exclude : excludeExprList) {
            if (BoolParser.parse(toBoolExpr(exclude, found))) {
                hitExpr = new HashSet<>();
                return false;
            }
        }
        Set<String> hits = new HashSet<>();
        boolean result = false;
        for (String include : includeExprList) {
            result = BoolParser.parse(toBoolExpr(include, found));
            if (!result) {
                break;
            }
            hits.add(toReadable(include));
        }
        hitExpr = hits;
        return result;
    }

    /** Replaces every keyword id with {@code T} (keyword present) or {@code F} (absent). */
    private String toBoolExpr(String compiled, Set<String> found) {
        StringBuilder out = new StringBuilder(compiled.length());
        for (int i = 0; i < compiled.length(); i++) {
            char ch = compiled.charAt(i);
            String word = idMapWord.get(ch);
            if (word == null) {
                out.append(ch);
            } else {
                out.append(found.contains(word) ? 'T' : 'F');
            }
        }
        return out.toString();
    }

    /** Turns a compiled expression back into rule syntax: ids become keywords, {@code &} becomes {@code +}. */
    private String toReadable(String compiled) {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < compiled.length(); i++) {
            char ch = compiled.charAt(i);
            String word = idMapWord.get(ch);
            if (word != null) {
                out.append(word);
            } else if (ch == '&') {
                out.append('+');
            } else {
                out.append(ch);
            }
        }
        return out.toString();
    }

    /**
     * Returns the include expressions recorded by the most recent {@link #match(String)} call.
     *
     * @throws IllegalStateException if {@code match} has not been called yet
     */
    public Set<String> hitExpr() {
        if (hitExpr == null) {
            throw new IllegalStateException("请先匹配文章");
        }
        return hitExpr;
    }

    /** Character trie holding every keyword of both rules. */
    private static class Tree {
        private final Map<Character, Node> nodes = new HashMap<>();

        public Tree() {
        }

        /** Adds {@code word} to the trie; an empty word is ignored. */
        public void insert(String word) {
            Objects.requireNonNull(word);
            if (word.isEmpty()) {
                return;
            }
            char[] chars = word.toCharArray();
            Node node = nodes.computeIfAbsent(chars[0], Node::new);
            for (int i = 1; i < chars.length; i++) {
                node = node.putChild(chars[i]);
            }
            // Explicit marker, so a keyword that is a prefix of a longer keyword is still a word.
            node.markEnd();
        }

        /** Returns whether {@code word} is a prefix of (or equal to) some inserted keyword. */
        public boolean containsTheWord(String word) {
            return walk(word) != null;
        }

        /** Returns whether {@code word} is exactly one of the inserted keywords. */
        public boolean existTheWord(String word) {
            Node node = walk(word);
            return node != null && node.isEnd();
        }

        /** Collects every inserted keyword that occurs as a substring of {@code content}. */
        Set<String> wordsIn(String content) {
            Set<String> found = new HashSet<>();
            int n = content.length();
            for (int start = 0; start < n; start++) {
                Node node = nodes.get(content.charAt(start));
                // Invariant: node is the trie node for content[start, end).
                for (int end = start + 1; node != null; end++) {
                    if (node.isEnd()) {
                        found.add(content.substring(start, end));
                    }
                    if (end >= n) {
                        break;
                    }
                    node = node.getChild(content.charAt(end));
                }
            }
            return found;
        }

        /** Follows {@code word} from the root; returns the node reached, or null if the path is absent. */
        private Node walk(String word) {
            Objects.requireNonNull(word);
            if (word.isEmpty()) {
                throw new RuntimeException("empty string");
            }
            Node node = nodes.get(word.charAt(0));
            for (int i = 1; node != null && i < word.length(); i++) {
                node = node.getChild(word.charAt(i));
            }
            return node;
        }
    }

    /** One trie node: a character, its children, and whether a keyword ends here. */
    private static class Node {
        protected final char value;
        private final Map<Character, Node> childNodes = new HashMap<>();
        private boolean end;

        public Node(char value) {
            this.value = value;
        }

        /** Returns the child for {@code value}, creating it if necessary. */
        public Node putChild(char value) {
            return childNodes.computeIfAbsent(value, Node::new);
        }

        /** Returns the child for {@code value}, or null if there is none. */
        public Node getChild(char value) {
            return childNodes.get(value);
        }

        public boolean containsNode(char value) {
            return childNodes.containsKey(value);
        }

        /** Flags this node as the last character of an inserted keyword. */
        void markEnd() {
            end = true;
        }

        /** Returns whether an inserted keyword ends at this node. */
        public boolean isEnd() {
            return end;
        }
    }
}
使用示例1
/**
 * Usage example 1: an include rule plus an exclude rule, evaluated against the sample article.
 */
public static void main(String[] args) {
    String article = "小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";
    WordMatcher wordMatcher = new WordMatcher("(小学生|(思绪|想象力+适应a))", "(小x生)");
    boolean matched = wordMatcher.match(article);
    // Expected values below are the ones given in the original article.
    System.out.println(matched); // false
    System.out.println(wordMatcher.hitExpr()); // []
}
使用示例2
/**
 * Usage example 2: an include rule only (blank exclude rule), evaluated against the sample article.
 */
public static void main(String[] args) {
    String article = "小学生刚刚步入校园生活,还在适应期,虽然坐在教室,但是思绪不知道飘到哪里去了,这也是他们想象力如此丰富的原因";
    WordMatcher wordMatcher = new WordMatcher("(小学生|(思绪|想象力+适应期))", "");
    boolean matched = wordMatcher.match(article);
    // Expected values below are the ones given in the original article.
    System.out.println(matched); // true
    System.out.println(wordMatcher.hitExpr()); // [(思绪|想象力+适应期), 小学生]
}