某电商机试题
你需要实现英文单词的过滤屏蔽功能,需求列表如下
- 能够屏蔽单个单词,例如当敏感词是"nice"时,输入"You are a nIce person",输出为"You are a XXXX person"。
- 能够屏蔽多个单词,例如当敏感词是"nice"、"sun"、"happy"时,输入"Such a nice day with a bright sun, makes me happy",输出为"Such a XXXX day with a bright XXX, makes me XXXXX"。
- 能够替换掉包含敏感词作为前缀的单词,例如当敏感词是"friend"时,输入"You are so friendly!",输出为"You are so XXXXXXXX!"(替换后的长度与原单词一致)。
- 能够将敏感词替换为指定的单词,例如定义了下述字典,那么输入"Objection is bad, a better thing to do, is to agree…",输出为"Thoughtcrime is ungood, a gooder thing to do, is to crimestop…"。
bad->ungood
better->gooder
agree->crimestop
objection->thoughtcrime
- 能够替换掉包含敏感词作为后缀的单词,例如当敏感词是"happy"时,输入"Are you unhappy today?",输出为"Are you XXXXXXX today?"。
补充
- 如果没有指定替换的词,使用默认的字符替换,比如X。
- 只有完全匹配才使用指定的单词替换。
- 如果是前置匹配或者后置匹配,整个单词使用默认字符替换,并且和原单词长度一致。
- 中间包含不做处理。
题目理解
- 题目本身的要求有5点,简单来说就是字符串替换。
- 要考虑到敏感词库特别多的情况。
- 可以动态新增和修改敏感词库。
解题思路
- String自带方法,replaceAll方法,这种如果是包含关系或者前置后置都不行。
- 使用map保存敏感词的key和value,遍历字符串,替换单词,这里也不能很好处理前置后置问题。
- 使用字典树,最终使用这种方案,方便扩展。
代码
基本节点
- 保存子节点信息
- 当前节点敏感词
- 当前节点替换词
- 父节点引用
package com;
import java.util.HashMap;
import java.util.Map;
/**
 * One node of the sensitive-word trie.
 *
 * <p>A node stores its outgoing edges keyed by character, a link back to its
 * parent, and two optional strings: the sensitive word ({@code source}) and its
 * replacement ({@code target}). All four values are plain mutable properties;
 * this class applies no validation of its own.
 *
 * @author gang.tu
 */
public class Node {

    /** Outgoing edges: next character to child node. Starts out empty. */
    private Map<Character, Node> childNodes = new HashMap<>();

    /** Node this one hangs off; {@code null} until a parent is assigned. */
    private Node parentNode;

    /** Sensitive word stored on this node; {@code null} when none is set. */
    private String source;

    /** Replacement stored on this node; {@code null} when none is set. */
    private String target;

    /** Creates a node with an empty child map and no parent, source or target. */
    public Node() {
    }

    /** @return the live (not copied) child map of this node */
    public Map<Character, Node> getChildNodes() {
        return this.childNodes;
    }

    /** @param childNodes map that becomes this node's child map */
    public void setChildNodes(Map<Character, Node> childNodes) {
        this.childNodes = childNodes;
    }

    /** @return the parent node, or {@code null} if none was assigned */
    public Node getParentNode() {
        return this.parentNode;
    }

    /** @param parentNode node to record as this node's parent */
    public void setParentNode(Node parentNode) {
        this.parentNode = parentNode;
    }

    /** @return the sensitive word stored here, or {@code null} */
    public String getSource() {
        return this.source;
    }

    /** @param source sensitive word to store on this node */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the replacement stored here, or {@code null} */
    public String getTarget() {
        return this.target;
    }

    /** @param target replacement to store on this node */
    public void setTarget(String target) {
        this.target = target;
    }
}
字典树
- 实现基本的crud功能
- 实现过滤功能
package com;
import com.sun.deploy.util.StringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Trie (prefix tree) of sensitive words, used to mask or replace single words
 * and to filter whole sentences.
 *
 * <p>Matching rules applied to one word:
 * <ul>
 *   <li>the whole word equals a sensitive word: it is replaced by that word's
 *       target, or masked when the target equals the default mask unit;</li>
 *   <li>a sensitive word is a proper prefix or a proper suffix of the word: the
 *       whole word is masked, one mask unit per character;</li>
 *   <li>a sensitive word occurring only in the middle of the word is ignored.</li>
 * </ul>
 *
 * <p>Matching is case-insensitive: trie edges are keyed by the lower-cased
 * character, so {@code "Bad"} and {@code "bad"} are the same rule. Only
 * contiguous runs of characters are matched.
 *
 * <p>The class relies on nothing outside {@code java.lang} besides {@link Node};
 * in particular the sentence is re-assembled with a {@link StringBuilder}
 * instead of the JDK-internal {@code com.sun.deploy.util.StringUtils}.
 *
 * @author gang.tu
 */
public class NodeTree {

    /** Root of the trie; stands for the empty prefix and is never a rule itself. */
    private Node root;

    /** Mask unit repeated once per character of a masked word. */
    private String defaultTarget;

    /** Creates an empty tree whose mask unit is {@code "X"}. */
    public NodeTree() {
        this("X");
    }

    /**
     * Creates an empty tree with a custom mask unit.
     *
     * @param defaultTarget string repeated once per masked character;
     *                      {@code null} falls back to {@code "X"}
     */
    public NodeTree(String defaultTarget) {
        root = new Node();
        this.defaultTarget = defaultTarget == null ? "X" : defaultTarget;
    }

    /**
     * Registers a sensitive word that is masked with the default mask unit.
     *
     * @param source the sensitive word
     * @return {@code true} if the word was added, {@code false} if it is
     *         {@code null}, empty or already registered
     */
    boolean insert(String source) {
        return insert(source, defaultTarget);
    }

    /**
     * Registers a sensitive word together with its replacement.
     *
     * @param source the sensitive word (matched case-insensitively)
     * @param target replacement used on an exact match; {@code null} means
     *               "use the default mask unit"
     * @return {@code true} if the word was added, {@code false} if it is
     *         {@code null}, empty or already registered
     */
    boolean insert(String source, String target) {
        if (source == null || source.isEmpty()) {
            return false;
        }
        Node node = root;
        for (int i = 0; i < source.length(); i++) {
            Character key = fold(source.charAt(i));
            Node child = node.getChildNodes().get(key);
            if (child == null) {
                child = new Node();
                child.setParentNode(node);
                node.getChildNodes().put(key, child);
            }
            node = child;
        }
        if (node.getTarget() != null) {
            return false;
        }
        node.setTarget(target == null ? defaultTarget : target);
        node.setSource(source);
        return true;
    }

    /**
     * Changes the replacement of an already registered sensitive word.
     *
     * <p>Only nodes that currently carry a rule are updated. Updating a mere
     * prefix of a registered word (for example {@code "tes"} when only
     * {@code "test"} exists) is refused, because it would create a rule without
     * a source word.
     *
     * @param source the registered sensitive word
     * @param target the new replacement; must not be {@code null}
     * @return {@code true} if the rule existed and was changed
     */
    boolean update(String source, String target) {
        if (target == null) {
            return false;
        }
        Node node = query(source);
        if (!isRule(node)) {
            return false;
        }
        node.setTarget(target);
        return true;
    }

    /**
     * Looks up the trie node reached by following {@code source} from the root.
     *
     * <p>The returned node is not necessarily a rule: it may be an intermediate
     * node on the path to a longer word. An empty string yields the root.
     *
     * @param source path to follow (case-insensitive)
     * @return the node at the end of the path, or {@code null} if the path does
     *         not exist or {@code source} is {@code null}
     */
    Node query(String source) {
        if (source == null) {
            return null;
        }
        return walk(source, 0);
    }

    /**
     * Removes a registered sensitive word.
     *
     * <p>After clearing the rule, every node on the path that has become useless
     * (no children and no rule of its own) is detached from its parent, walking
     * upwards until a node that is still needed, or the root, is reached.
     *
     * @param source the sensitive word to remove
     * @return {@code true} if the word was registered and has been removed
     */
    boolean delete(String source) {
        Node node = query(source);
        if (!isRule(node)) {
            return false;
        }
        node.setTarget(null);
        node.setSource(null);

        // depth == number of characters consumed to reach 'node'; 0 is the root.
        int depth = source.length();
        while (depth > 0 && node.getTarget() == null && node.getChildNodes().isEmpty()) {
            Node parent = node.getParentNode();
            if (parent == null) {
                break;
            }
            parent.getChildNodes().remove(fold(source.charAt(depth - 1)));
            node = parent;
            depth--;
        }
        return true;
    }

    /**
     * Applies the sensitive-word rules to a single word.
     *
     * @param word the word to check; {@code null} and empty are returned as is
     * @return the replacement on an exact match with a custom target; a mask of
     *         {@code word.length()} mask units on an exact match with the default
     *         target or on a prefix/suffix match; otherwise {@code word} itself.
     *         When the word in the text starts with an upper-case letter that
     *         the registered rule does not have, the replacement's first letter
     *         is upper-cased as well ({@code "Bad"} becomes {@code "Ungood"}).
     */
    String searchTarget(String word) {
        Node node = searchNode(word);
        if (node == null) {
            return word;
        }
        // Each trie node has exactly one path from the root, so the match is
        // exact precisely when the full word leads to the very same node.
        boolean exact = walk(word, 0) == node;
        String target = node.getTarget();
        if (!exact || target.equals(defaultTarget)) {
            return mask(word.length());
        }
        return matchLeadingCase(word, node.getSource(), target);
    }

    /**
     * Finds the rule that applies to a word.
     *
     * <p>Preference order: a rule equal to the whole word, then the shortest
     * rule that is a proper prefix of the word, then the longest rule that is a
     * proper suffix. Rules that occur only in the middle of the word are not
     * reported.
     *
     * @param word the word to check
     * @return the node carrying the applicable rule, or {@code null} if there is
     *         none or {@code word} is {@code null} or empty
     */
    Node searchNode(String word) {
        if (word == null || word.isEmpty()) {
            return null;
        }

        // Single pass from the root: detects the exact match and remembers the
        // first rule met on the way, which is a proper prefix of the word.
        Node firstPrefix = null;
        Node node = root;
        int last = word.length() - 1;
        for (int i = 0; i <= last; i++) {
            node = node.getChildNodes().get(fold(word.charAt(i)));
            if (node == null) {
                break;
            }
            if (node.getTarget() != null) {
                if (i == last) {
                    return node;
                }
                if (firstPrefix == null) {
                    firstPrefix = node;
                }
            }
        }
        if (firstPrefix != null) {
            return firstPrefix;
        }

        // Proper suffixes: the tail starting at 'start' must be consumed
        // completely and end on a rule.
        for (int start = 1; start <= last; start++) {
            Node tail = walk(word, start);
            if (isRule(tail)) {
                return tail;
            }
        }
        return null;
    }

    /**
     * Filters a sentence: every maximal run of ASCII letters is treated as one
     * word and passed through {@link #searchTarget(String)}; all other
     * characters are copied unchanged.
     *
     * @param str the text to filter; {@code null} and empty are returned as is
     * @return the filtered text
     */
    String filter(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        StringBuilder out = new StringBuilder(str.length());
        int begin = 0;
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (isAsciiLetter(c)) {
                continue;
            }
            if (i > begin) {
                out.append(searchTarget(str.substring(begin, i)));
            }
            out.append(c);
            begin = i + 1;
        }
        if (begin < str.length()) {
            // The text ends with a word that no separator has flushed yet.
            out.append(searchTarget(str.substring(begin)));
        }
        return out.toString();
    }

    /** Follows {@code text} from index {@code from} to its end; {@code null} if the path breaks. */
    private Node walk(String text, int from) {
        Node node = root;
        for (int i = from; i < text.length() && node != null; i++) {
            node = node.getChildNodes().get(fold(text.charAt(i)));
        }
        return node;
    }

    /** Tells whether {@code node} is a non-root node that currently carries a rule. */
    private boolean isRule(Node node) {
        return node != null && node != root && node.getTarget() != null;
    }

    /** Builds the mask for a word of the given length: one mask unit per character. */
    private String mask(int length) {
        StringBuilder sb = new StringBuilder(length * defaultTarget.length());
        for (int i = 0; i < length; i++) {
            sb.append(defaultTarget);
        }
        return sb.toString();
    }

    /** Maps a character to the key used for trie edges (its lower-case form). */
    private static Character fold(char c) {
        return Character.valueOf(Character.toLowerCase(c));
    }

    /** Tells whether {@code c} is one of A-Z or a-z. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /**
     * Upper-cases the first letter of {@code replacement} when the word in the
     * text starts with an upper-case letter that the registered rule itself
     * does not start with; otherwise returns {@code replacement} unchanged.
     */
    private static String matchLeadingCase(String word, String ruleSource, String replacement) {
        if (replacement.isEmpty() || ruleSource == null || ruleSource.isEmpty()) {
            return replacement;
        }
        char first = word.charAt(0);
        boolean capitalisedInText = Character.isUpperCase(first) && first != ruleSource.charAt(0);
        if (!capitalisedInText) {
            return replacement;
        }
        return Character.toUpperCase(replacement.charAt(0)) + replacement.substring(1);
    }
}
临时工具类
- 读取文件
- 字符串分割
- 判断是否为字符
package com;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
/**
 * Throwaway helpers for the exercise: reading the rule file, tokenising a rule
 * line and checking whether a string consists of letters.
 *
 * @author gang.tu
 */
public class Utils {

    /** Static helpers only; not meant to be instantiated. */
    private Utils() {
    }

    /**
     * Reads a text file line by line, decoding it as GBK.
     *
     * <p>Reading is best-effort: if {@code path} does not denote an existing
     * regular file the result is empty, and if an exception occurs while
     * reading, its stack trace is printed and the lines read so far are
     * returned.
     *
     * @param path location of the file
     * @return the lines of the file in order; never {@code null}
     */
    public static List<String> getTxt(String path) {
        List<String> lines = new ArrayList<>();
        File ruleFile = new File(path);
        if (!ruleFile.isFile() || !ruleFile.exists()) {
            return lines;
        }
        try (InputStream in = new FileInputStream(ruleFile);
             InputStreamReader decoder = new InputStreamReader(in, "GBK");
             BufferedReader reader = new BufferedReader(decoder)) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                lines.add(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return lines;
    }

    /**
     * Splits a string with {@link StringTokenizer}.
     *
     * <p>Note that {@code StringTokenizer} treats {@code delim} as a set of
     * delimiter characters, not as one multi-character separator: with
     * {@code "->"} both {@code '-'} and {@code '>'} separate tokens, and empty
     * tokens are never produced.
     *
     * @param source the string to split
     * @param delim  the delimiter characters
     * @return the tokens in order of appearance
     */
    public static String[] splitByStringTokenizer(String source, String delim) {
        StringTokenizer tokenizer = new StringTokenizer(source, delim);
        String[] tokens = new String[tokenizer.countTokens()];
        for (int idx = 0; tokenizer.hasMoreTokens(); idx++) {
            tokens[idx] = tokenizer.nextToken();
        }
        return tokens;
    }

    /**
     * Checks whether a string consists of one or more ASCII letters only.
     *
     * @param str the string to check
     * @return {@code true} if {@code str} is non-null, non-empty and made up of
     *         the letters a-z / A-Z only
     */
    public static boolean isAlpha(String str) {
        return str != null && str.matches("[a-zA-Z]+");
    }
}
测试类
package com;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.util.List;
import static org.junit.Assert.*;
/**
 * JUnit 4 tests for {@link NodeTree}: single-word lookup, insert, update,
 * delete and sentence filtering.
 *
 * <p>Every test starts from a fresh tree loaded from
 * {@code src/main/resources/map.txt}. The assertions below rely on that file
 * containing the rules {@code bad->ungood}, {@code better->gooder},
 * {@code agree->crimestop}, {@code hello->hi}, {@code test} and {@code nice}.
 *
 * @author gang.tu
 */
public class NodeTreeTest {
    NodeTree nodeTree;

    /** Builds a fresh tree from the rule file before each test. */
    @Before
    public void init() {
        nodeTree = new NodeTree();
        List<String> txtList = Utils.getTxt("src/main/resources/map.txt");
        // Utils.getTxt returns an empty list when the file cannot be read; fail
        // here with a clear message instead of in some unrelated assertion.
        Assert.assertFalse("rule file src/main/resources/map.txt is missing or empty", txtList.isEmpty());
        for (String line : txtList) {
            String[] rule = Utils.splitByStringTokenizer(line, "->");
            if (rule.length > 1) {
                String source = rule[0];
                String target = rule[1];
                nodeTree.insert(source, target);
            } else {
                nodeTree.insert(line);
            }
        }
    }

    @Test
    public void searchTarget() {
        // No sensitive word involved: the word is returned unchanged.
        Assert.assertEquals("tree", nodeTree.searchTarget("tree"));
        Assert.assertEquals("ello", nodeTree.searchTarget("ello"));
        Assert.assertEquals("ba", nodeTree.searchTarget("ba"));
        // Exact match with a configured replacement.
        Assert.assertEquals("ungood", nodeTree.searchTarget("bad"));
        Assert.assertEquals("hi", nodeTree.searchTarget("hello"));
        // Exact match without a configured replacement: masked.
        Assert.assertEquals("XXXX", nodeTree.searchTarget("test"));
        Assert.assertEquals("XXXX", nodeTree.searchTarget("nice"));
        // A sensitive word is a prefix of the word: whole word masked.
        Assert.assertEquals("XXXXXX", nodeTree.searchTarget("helloo"));
        Assert.assertEquals("XXXXX", nodeTree.searchTarget("testo"));
        // A sensitive word is a suffix of the word: whole word masked.
        Assert.assertEquals("XXXX", nodeTree.searchTarget("tbad"));
        Assert.assertEquals("XXXXX", nodeTree.searchTarget("atest"));
        // A sensitive word only in the middle: left untouched.
        Assert.assertEquals("atesto", nodeTree.searchTarget("atesto"));
        Assert.assertEquals("tbadt", nodeTree.searchTarget("tbadt"));
    }

    @Test
    public void insert() {
        // Inserting an existing word fails.
        Assert.assertFalse(nodeTree.insert("test"));
        // Inserting a new word succeeds.
        Assert.assertTrue(nodeTree.insert("tttt"));
        // A newly inserted word is matched afterwards.
        Assert.assertEquals("wwX", nodeTree.searchTarget("wwX"));
        nodeTree.insert("wwX");
        Assert.assertEquals("XXX", nodeTree.searchTarget("wwX"));
        // A newly inserted word also matches as a prefix.
        Assert.assertEquals("withTarget", nodeTree.searchTarget("withTarget"));
        nodeTree.insert("with", "aaaa");
        Assert.assertEquals("XXXXX", nodeTree.searchTarget("withT"));
    }

    @Test
    public void update() {
        // An existing word can be updated.
        Assert.assertTrue(nodeTree.update("test", "test1"));
        // A word that does not exist cannot be updated.
        Assert.assertFalse(nodeTree.update("testee", "test1"));
        // After an update the new replacement is used.
        Assert.assertEquals("crimestop", nodeTree.searchTarget("agree"));
        nodeTree.update("agree", "wwww");
        Assert.assertEquals("wwww", nodeTree.searchTarget("agree"));
    }

    @Test
    public void delete() {
        // An existing word can be removed and is no longer matched.
        nodeTree.insert("my");
        Assert.assertEquals("XX", nodeTree.searchTarget("my"));
        Assert.assertTrue(nodeTree.delete("my"));
        Assert.assertEquals("my", nodeTree.searchTarget("my"));
        // A word that does not exist (any more) cannot be removed.
        Assert.assertFalse(nodeTree.delete("my"));
        // Removing a leaf word also removes its key from the parent node.
        nodeTree.insert("bada");
        Assert.assertEquals(1, nodeTree.query("bad").getChildNodes().size());
        Assert.assertTrue(nodeTree.delete("bada"));
        Assert.assertEquals(0, nodeTree.query("bad").getChildNodes().size());
    }

    @Test
    public void filter() {
        String t1 = "he is a nice bad test boy";
        String q1 = "he is a XXXX ungood XXXX boy";
        Assert.assertEquals(q1, nodeTree.filter(t1));
        // Without punctuation.
        String t2 = "bad test";
        String q2 = "ungood XXXX";
        Assert.assertEquals(q2, nodeTree.filter(t2));
        // With punctuation.
        String t3 = "bad test!";
        String q3 = "ungood XXXX!";
        Assert.assertEquals(q3, nodeTree.filter(t3));
        // Sensitive word as a prefix.
        String t4 = "badqq test!";
        String q4 = "XXXXX XXXX!";
        Assert.assertEquals(q4, nodeTree.filter(t4));
        // Sensitive word in the middle.
        String t5 = "qqbadqq atesta!";
        String q5 = "qqbadqq atesta!";
        Assert.assertEquals(q5, nodeTree.filter(t5));
        // Sensitive word as a suffix.
        String t6 = "qqbad atesta";
        String q6 = "XXXXX atesta";
        Assert.assertEquals(q6, nodeTree.filter(t6));
        String t7 = "bad, atesta";
        String q7 = "ungood, atesta";
        Assert.assertEquals(q7, nodeTree.filter(t7));
    }
}
敏感词初始化文件
map.txt写入以下内容。
bad->ungood
better->gooder
agree->crimestop
hello->hi
test
nice
结构截图
结束语
这个是我的实现方法,这个方案发送给某电商,结果没有接收到面试邀请,估计有更好的实现方案,我没有想到,欢迎讨论。