需求简述:
从句子中提取标准词库中出现的词语(稍作调整即可同时记录位置、词性等信息)
示例:
输入:
text: 问渠那得清如许, 为有源头活水来, 问渠
dictionary: ['问渠', '问渠那', '清如许', '活水']
输出:
['问渠那', '清如许', '活水', '问渠']
package kim.nzxy;
import java.util.*;
/**
 * Extracts dictionary keywords from free text by greedy longest-match scanning
 * over a character trie.
 *
 * <p>The text is scanned left to right. At each position the longest dictionary
 * word starting there is taken and scanning resumes immediately after it; when no
 * dictionary word starts at the position, the scan advances by one {@code char}.
 * Matching is done on UTF-16 {@code char} units.
 *
 * @author ly-chn
 */
public class LyKeywordsExtractor {
    /** Trie holding every dictionary word; populated once in the constructor. */
    private final Trie trie;

    /**
     * Builds an extractor for the given dictionary.
     *
     * @param dictionary the words to look for; an empty string entry never matches
     * @throws NullPointerException if {@code dictionary} or any of its elements is null
     */
    public LyKeywordsExtractor(Set<String> dictionary) {
        Objects.requireNonNull(dictionary, "dictionary");
        this.trie = new Trie();
        for (String word : dictionary) {
            trie.insert(Objects.requireNonNull(word, "dictionary word"));
        }
    }

    /** Small demo: prints the keywords found in a sample sentence. */
    public static void main(String[] args) {
        Set<String> standardKeywords = new HashSet<>();
        standardKeywords.add("问渠");
        standardKeywords.add("渠那");
        standardKeywords.add("清如许");
        String text = "问渠那得清如许, 为有源头活水来, 问渠那得清如许, 为有源头活水来";
        LyKeywordsExtractor extractor = new LyKeywordsExtractor(standardKeywords);
        Set<String> extractedKeywords = extractor.segment(text);
        System.out.println(extractedKeywords);
    }

    /**
     * Returns the dictionary words found in {@code text}.
     *
     * @param text the text to scan
     * @return the distinct matched words, in order of first occurrence
     * @throws NullPointerException if {@code text} is null
     */
    public Set<String> segment(String text) {
        Objects.requireNonNull(text, "text");
        Set<String> words = new LinkedHashSet<>();
        int length = text.length();
        int i = 0;
        while (i < length) {
            // One trie walk per position instead of one substring + lookup per candidate end.
            int matchEnd = trie.longestMatchEnd(text, i);
            if (matchEnd > i) {
                words.add(text.substring(i, matchEnd));
                // Skip the whole match so words overlapping it are not reported.
                i = matchEnd;
            } else {
                i++;
            }
        }
        return words;
    }

    /** A trie node: outgoing edges keyed by character, plus an end-of-word marker. */
    static class TrieNode {
        final Map<Character, TrieNode> children;
        /** True when the path from the root to this node spells an inserted word. */
        boolean isWord;

        TrieNode() {
            children = new HashMap<>();
            isWord = false;
        }
    }

    /** Character trie supporting insertion, exact lookup and longest-prefix matching. */
    static class Trie {
        final TrieNode root;

        Trie() {
            root = new TrieNode();
        }

        /**
         * Adds {@code word} to the trie, creating nodes as needed.
         *
         * @param word the word to add
         */
        void insert(String word) {
            TrieNode node = root;
            for (int i = 0; i < word.length(); i++) {
                char c = word.charAt(i);
                TrieNode next = node.children.get(c);
                if (next == null) {
                    next = new TrieNode();
                    node.children.put(c, next);
                }
                node = next;
            }
            node.isWord = true;
        }

        /**
         * Tells whether exactly {@code word} was inserted (a mere prefix of an
         * inserted word does not count).
         *
         * @param word the word to look up
         * @return true if {@code word} was inserted
         */
        boolean find(String word) {
            TrieNode node = root;
            for (int i = 0; i < word.length(); i++) {
                node = node.children.get(word.charAt(i));
                if (node == null) {
                    return false;
                }
            }
            return node.isWord;
        }

        /**
         * Finds the longest inserted word that starts at {@code start} in {@code text}.
         *
         * @param text  the text being scanned
         * @param start index of the first character to match
         * @return the exclusive end index of the longest match, or -1 if no
         *         inserted word starts at {@code start}
         */
        int longestMatchEnd(String text, int start) {
            TrieNode node = root;
            int end = -1;
            for (int j = start; j < text.length(); j++) {
                node = node.children.get(text.charAt(j));
                if (node == null) {
                    // No inserted word continues with this character.
                    break;
                }
                if (node.isWord) {
                    // Keep walking: a longer word may still follow.
                    end = j + 1;
                }
            }
            return end;
        }
    }
}
简述一下流程:
首先用词库构建字典树, 然后从给定字符串的起始位置依次向后检索, 判断以当前位置开头的子串是否在字典中出现: 如果出现(若有多个, 则仅提取最长的那个), 则列入结果集, 并跳到该词之后继续检索; 否则向后移动一个字符. 结果集会自动去重, 并保持词语首次出现的顺序