分词器介绍
ik
- 中文分词器,免费,使用方便,自带词库,可以指定自定义词库
- ik_smart 智能分词,较max分词粒度更粗,结果数更少。一般在创建索引时,对大量文本的内容,使用smart分词
- ik_max_word 尽可能多的分词,一般搜索时对检索条件使用max
ik-pinyin
ngram
- elasticsearch自带的分词器,中英文都可以使用
- 直接对内容分词,按照字符数(min_gram ~ max_gram)切分出所有连续子串;相关分词器分为2种:ngram 和 edge_ngram
- 可以巧妙地实现模糊匹配,可以做类似SQL里like的操作
代码示例
说明
- elasticsearch5.5,使用bboss工具包,此方法对版本没要求
- ngram是Elasticsearch自带的,不需要额外安装
- ik 和 ik-pinyin 需要自行安装,可以参考对应插件的官方安装文档
- 本次分词是通过 Elasticsearch 的 HTTP 接口(_analyze)完成的,直接使用 es 安装环境的配置,不需要在项目里配置字典路径等
- 本次是使用bboss工具包里的http工具类,也可以使用自己的工具类,只要能发送http请求都可以
详细代码
/**
 * Tokenizes {@code content} via the Elasticsearch {@code /_analyze} HTTP endpoint and returns
 * the distinct tokens whose length lies within {@code [min, max]}.
 *
 * <p>Only tokens whose reported type is {@code CN_WORD} or {@code word} (case-insensitive) are
 * kept; every other token type in the response is dropped.
 *
 * @param tokenizer tokenizer name sent to Elasticsearch; defaults to {@code "ik_smart"} when
 *                  empty. The value {@code "ngram"} is special-cased: an inline ngram tokenizer
 *                  definition is sent with {@code min_gram}/{@code max_gram} taken from
 *                  {@code min}/{@code max} and {@code token_chars} of letter and digit.
 * @param min       minimum token length to keep (inclusive); defaults to 2 when {@code null}
 * @param max       maximum token length to keep (inclusive); defaults to 7 when {@code null}
 * @param content   text to analyze; an empty or {@code null} value yields an empty result
 *                  without contacting Elasticsearch
 * @return a {@code Set<String>} of distinct tokens (declared as {@code Object} for
 *         compatibility with existing callers); never {@code null}
 */
public static Object wordSplit(String tokenizer,Integer min, Integer max,String content){
    Set<String> result = new HashSet<>();
    // Nothing to analyze; also avoids sending the literal text "null" for a null argument.
    if(StringUtils.isEmpty(content)){
        return result;
    }
    if(StringUtils.isEmpty(tokenizer)){
        tokenizer = "ik_smart";
    }
    int minLen = (null == min) ? 2 : min;
    int maxLen = (null == max) ? 7 : max;

    // String values are serialized through the JSON library so that quotes, backslashes and
    // control characters in caller-supplied text cannot break or alter the request body.
    StringBuilder requestStr = new StringBuilder();
    if("ngram".equals(tokenizer)){
        requestStr.append("{\"tokenizer\":").append("{\"max_gram\":").append(maxLen).append(",\"min_gram\":")
                .append(minLen).append(",\"token_chars\":[\"letter\",\"digit\"],\"type\":\"ngram\"}");
    }else {
        requestStr.append("{\"tokenizer\": ").append(JSON.toJSONString(tokenizer));
    }
    requestStr.append(", \"text\": ").append(JSON.toJSONString(content)).append("}");

    ClientInterface clientUtil = ElasticSearchHelper.getRestClientUtil();
    String analysisResult = clientUtil.executeHttp("/_analyze",requestStr.toString(),"post");
    // Log the raw response first so it is available even if parsing below fails.
    esLog.info(analysisResult);

    JSONObject jsonObject = JSON.parseObject(analysisResult);
    // A response without a "tokens" array (e.g. an error body) produces an empty result.
    JSONArray tokens = (null == jsonObject) ? null : jsonObject.getJSONArray("tokens");
    if(null != tokens){
        for (Object object : tokens){
            JSONObject json = (JSONObject)object;
            String type = json.getString("type");
            if("CN_WORD".equalsIgnoreCase(type) || "word".equalsIgnoreCase(type)){
                String word = json.getString("token");
                if(null != word && word.length() >= minLen && word.length() <= maxLen){
                    result.add(word);
                }
            }
        }
    }
    esLog.info(JSON.toJSONString(result));
    return result;
}
部分场景测试结果示例
1,wordSplit("ik_smart",2,7,"中华人民共和国");
{"tokens":[{"token":"中华人民共和国","start_offset":0,"end_offset":7,"type":"CN_WORD","position":0}]}
["中华人民共和国"]
2,wordSplit("ik_max_word",2,7,"中华人民共和国");
{"tokens":[{"token":"中华人民共和国","start_offset":0,"end_offset":7,"type":"CN_WORD","position":0},{"token":"中华人民","start_offset":0,"end_offset":4,"type":"CN_WORD","position":1},{"token":"中华","start_offset":0,"end_offset":2,"type":"CN_WORD","position":2},{"token":"华人","start_offset":1,"end_offset":3,"type":"CN_WORD","position":3},{"token":"人民共和国","start_offset":2,"end_offset":7,"type":"CN_WORD","position":4},{"token":"人民","start_offset":2,"end_offset":4,"type":"CN_WORD","position":5},{"token":"共和国","start_offset":4,"end_offset":7,"type":"CN_WORD","position":6},{"token":"共和","start_offset":4,"end_offset":6,"type":"CN_WORD","position":7},{"token":"国","start_offset":6,"end_offset":7,"type":"CN_CHAR","position":8}]}
["共和","中华人民共和国","中华人民","华人","中华","人民共和国","人民","共和国"]
3,wordSplit("ngram",2,7,"中华人民共和国");
{"tokens":[{"token":"中华","start_offset":0,"end_offset":2,"type":"word","position":0},{"token":"中华人","start_offset":0,"end_offset":3,"type":"word","position":1},{"token":"中华人民","start_offset":0,"end_offset":4,"type":"word","position":2},{"token":"中华人民共","start_offset":0,"end_offset":5,"type":"word","position":3},{"token":"中华人民共和","start_offset":0,"end_offset":6,"type":"word","position":4},{"token":"中华人民共和国","start_offset":0,"end_offset":7,"type":"word","position":5},{"token":"华人","start_offset":1,"end_offset":3,"type":"word","position":6},{"token":"华人民","start_offset":1,"end_offset":4,"type":"word","position":7},{"token":"华人民共","start_offset":1,"end_offset":5,"type":"word","position":8},{"token":"华人民共和","start_offset":1,"end_offset":6,"type":"word","position":9},{"token":"华人民共和国","start_offset":1,"end_offset":7,"type":"word","position":10},{"token":"人民","start_offset":2,"end_offset":4,"type":"word","position":11},{"token":"人民共","start_offset":2,"end_offset":5,"type":"word","position":12},{"token":"人民共和","start_offset":2,"end_offset":6,"type":"word","position":13},{"token":"人民共和国","start_offset":2,"end_offset":7,"type":"word","position":14},{"token":"民共","start_offset":3,"end_offset":5,"type":"word","position":15},{"token":"民共和","start_offset":3,"end_offset":6,"type":"word","position":16},{"token":"民共和国","start_offset":3,"end_offset":7,"type":"word","position":17},{"token":"共和","start_offset":4,"end_offset":6,"type":"word","position":18},{"token":"共和国","start_offset":4,"end_offset":7,"type":"word","position":19},{"token":"和国","start_offset":5,"end_offset":7,"type":"word","position":20}]}
["华人民共和国","华人","华人民","中华","人民共和国","民共","民共和国","中华人民共和","人民共","中华人民共","共和","华人民共和","民共和","中华人民","中华人民共和国","中华人","人民共和","和国","人民","共和国","华人民共"]