代码记录:
/**
 * Pre-compiled pattern for detecting punctuation and symbol characters.
 *
 * <p>The expression has the shape {@code .*[...]+.*}, so a full match via
 * {@code matcher(s).matches()} succeeds when {@code s} contains at least one character from
 * the bracketed set. {@code customIk} uses it to discard short symbol-only tokens.
 */
private static final Pattern SPECIAL_CHAR_PATTERN = Pattern.compile(".*[`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】\\-‘;:”“’。,、?\\\\]+.*");
/**
 * Segments {@code text} with the IK segmenter and joins the retained tokens into a
 * comma-delimited string.
 *
 * <p>Each token is trimmed and then dropped when it is blank, when it is shorter than three
 * characters and matches {@code SPECIAL_CHAR_PATTERN}, or when it is a single non-digit
 * character. Every retained token is followed by a comma, so a non-empty result ends with a
 * trailing comma. If no token is retained, {@code text} itself is returned.
 *
 * <p>If the segmenter throws an {@link IOException}, the error is logged once, segmentation
 * stops, and whatever was collected up to that point is returned. (Catching inside the loop
 * and retrying could spin forever on a persistent failure.)
 *
 * @param text the text to segment
 * @return the retained tokens, each followed by {@code ","}, or {@code text} when nothing
 *     was retained
 */
private String customIk(String text) {
    StringBuilder word2Frequency = new StringBuilder();
    // Initialize the IK segmenter over the text (this also loads its dictionary).
    // NOTE(review): the `true` flag presumably selects IK's "smart" segmentation mode;
    // confirm against the IK Analyzer version in use.
    IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(text), true);
    try {
        Lexeme lex;
        while ((lex = ikSegmenter.next()) != null) {
            String word = StringUtils.trimToEmpty(lex.getLexemeText());
            if (StringUtils.isBlank(word)) {
                continue;
            }
            // Filter out short, high-frequency symbol tokens.
            if (word.length() < 3 && SPECIAL_CHAR_PATTERN.matcher(word).matches()) {
                continue;
            }
            // Filter out single-character tokens that are not digits; adjust to your needs.
            if (word.length() < 2 && !Character.isDigit(word.charAt(0))) {
                continue;
            }
            word2Frequency.append(word).append(",");
        }
    } catch (IOException e) {
        // Stop at the first failure instead of calling next() again on a broken segmenter.
        log.error("customIk.自定义分词异常", e);
    }
    // Fall back to the raw input when segmentation produced no usable token.
    if (word2Frequency.length() == 0) {
        word2Frequency.append(text);
    }
    log.info("customIk.自定义分词结果={}", word2Frequency);
    return word2Frequency.toString();
}