Elasticsearch: using the IK Chinese analyzer and the pinyin analyzer, and building a custom analyzer
1. Download the analyzers from GitHub
Pre-built packages are available there. After downloading, create two folders named ik and pinyin under the plugins/ directory of your Elasticsearch installation and unzip the packages into them, then restart Elasticsearch for the plugins to take effect. The README in each repository has usage instructions. Make sure the plugin version matches your Elasticsearch version: my Elasticsearch is 5.6.16, so I downloaded the 5.6.16 builds of both analyzers.
IK Chinese analyzer: https://github.com/medcl/elasticsearch-analysis-ik/releases
Pinyin analyzer: https://github.com/medcl/elasticsearch-analysis-pinyin/releases
You can also download the source and package it yourself with mvn clean package, but that is very slow; building the pinyin plugin took me over two hours, probably partly because of my network connection to the Maven repositories.
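Once Elasticsearch has restarted, a quick way to confirm that both plugins were loaded is the _cat API (the output below is illustrative; your node name will differ):
GET http://localhost:9200/_cat/plugins
node-1 analysis-ik     5.6.16
node-1 analysis-pinyin 5.6.16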
2. Using the analyzers
After unzipping the packages and restarting Elasticsearch, the analyzers are ready to use. Analyzers are scoped to an index, so when testing one you must specify which index to run it against.
ik_smart: performs the coarsest-grained segmentation; for example, it splits “中华人民共和国国歌” into “中华人民共和国” and “国歌”. Suitable for phrase queries.
GET http://localhost:9200/user_index/_analyze?analyzer=ik_smart&text=张三李四
Response:
{
  "tokens": [
    {
      "token": "张三李四",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 0
    }
  ]
}
ik_max_word: performs the finest-grained segmentation; for example, it splits “中华人民共和国国歌” into “中华人民共和国, 中华人民, 中华, 华人, 人民共和国, 人民, 人, 民, 共和国, 共和, 和, 国国, 国歌”, exhausting the possible combinations. Suitable for term queries.
GET http://localhost:9200/user_index/_analyze?analyzer=ik_max_word&text=张三李四
Response:
{
  "tokens": [
    {
      "token": "张三李四",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "张三",
      "start_offset": 0,
      "end_offset": 2,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "三",
      "start_offset": 1,
      "end_offset": 2,
      "type": "TYPE_CNUM",
      "position": 2
    },
    {
      "token": "李四",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "四",
      "start_offset": 3,
      "end_offset": 4,
      "type": "TYPE_CNUM",
      "position": 4
    }
  ]
}
GET http://localhost:9200/user_index/_analyze?analyzer=pinyin&text=张三李四
Response:
{
  "tokens": [
    {
      "token": "zhang",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "zsls",
      "start_offset": 0,
      "end_offset": 4,
      "type": "word",
      "position": 0
    },
    {
      "token": "san",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "li",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "si",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 3
    }
  ]
}
3. Custom analyzer: combining IK and pinyin
The IK analyzer does not appear to have any configurable options; you simply use it as-is. The pinyin analyzer, however, has many options you can tune. Out of the box it produces the full pinyin of the whole text, the joined first letters, and the complete pinyin of each individual character; that per-character pinyin is too fine-grained to be of much use, in my opinion. What I want is for the tokens produced by the Chinese analyzer to then be analyzed by the pinyin analyzer. That can be done as follows: use ik_max_word as the tokenizer, and run a pinyin token filter over its output as the final step.
{
  "index": {
    "number_of_replicas": "0",
    "number_of_shards": "1",
    "analysis": {
      "analyzer": {
        "ik_pinyin_analyzer": {
          "tokenizer": "my_ik_pinyin",
          "filter": "pinyin_first_letter_and_full_pinyin_filter"
        },
        "pinyin_analyzer": {
          "tokenizer": "my_pinyin"
        }
      },
      "tokenizer": {
        "my_ik_pinyin": {
          "type": "ik_max_word"
        },
        "my_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_separate_first_letter": false,
          "keep_full_pinyin": false,
          "keep_joined_full_pinyin": true,
          "keep_none_chinese": true,
          "none_chinese_pinyin_tokenize": false,
          "keep_none_chinese_in_joined_full_pinyin": true,
          "keep_original": false,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "trim_whitespace": true,
          "remove_duplicated_term": true
        }
      },
      "filter": {
        "pinyin_first_letter_and_full_pinyin_filter": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_separate_first_letter": false,
          "keep_full_pinyin": false,
          "keep_joined_full_pinyin": true,
          "keep_none_chinese": true,
          "none_chinese_pinyin_tokenize": false,
          "keep_none_chinese_in_joined_full_pinyin": true,
          "keep_original": false,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "trim_whitespace": true,
          "remove_duplicated_term": true
        }
      }
    }
  }
}
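If you create the index over REST rather than through Spring Data (which applies these settings via @Setting in section 4 below), the same JSON goes in the index-creation request body; a sketch, with the body abridged:
PUT http://localhost:9200/drug_index
{
  "settings": { "index": { ...the "index" object shown above... } }
}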
Let's test it:
GET http://localhost:9200/drug_index/_analyze?analyzer=ik_pinyin_analyzer&text=阿莫西林胶囊
The result is the ik_max_word segmentation of the Chinese text, analyzed again under the pinyin rules:
{
  "tokens": [
    {
      "token": "amoxilin",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "amxl",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "moxi",
      "start_offset": 1,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "mx",
      "start_offset": 1,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "xilin",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "xl",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "jiaonang",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "jn",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 3
    }
  ]
}
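For comparison, the pinyin_analyzer defined above runs the pinyin tokenizer alone, without IK, so it analyzes the whole string as a single unit. Given the options set on my_pinyin (joined full pinyin and joined first letters only), I would expect roughly two tokens, "amoxilinjiaonang" and "amxljn":
GET http://localhost:9200/drug_index/_analyze?analyzer=pinyin_analyzer&text=阿莫西林胶囊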
4. Testing from code
package com.boot.es.model;

import lombok.Data;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;
import org.springframework.data.elasticsearch.annotations.InnerField;
import org.springframework.data.elasticsearch.annotations.MultiField;
import org.springframework.data.elasticsearch.annotations.Setting;

/**
 * Author: susq
 * Date: 2019-06-30 10:12
 */
@Data
@Document(indexName = "drug_index", type = "drug")
@Setting(settingPath = "settings.json") // the analysis settings from section 3, on the classpath
public class Drug {

    @Id
    private Long id;

    @Field(type = FieldType.Keyword)
    private String price;

    // keyword main field plus analyzed sub-fields: name.ik, name.ik_pinyin, name.pinyin
    @MultiField(
            mainField = @Field(type = FieldType.Keyword),
            otherFields = {
                    @InnerField(type = FieldType.Text, suffix = "ik", analyzer = "ik_max_word", searchAnalyzer = "ik_max_word"),
                    @InnerField(type = FieldType.Text, suffix = "ik_pinyin", analyzer = "ik_pinyin_analyzer", searchAnalyzer = "ik_pinyin_analyzer"),
                    @InnerField(type = FieldType.Text, suffix = "pinyin", analyzer = "pinyin_analyzer", searchAnalyzer = "pinyin_analyzer")
            }
    )
    private String name;

    @MultiField(
            mainField = @Field(type = FieldType.Keyword),
            otherFields = {
                    @InnerField(type = FieldType.Text, suffix = "ik", analyzer = "ik_max_word", searchAnalyzer = "ik_smart"),
                    @InnerField(type = FieldType.Text, suffix = "ik_pinyin", analyzer = "ik_pinyin_analyzer", searchAnalyzer = "ik_pinyin_analyzer"),
                    @InnerField(type = FieldType.Text, suffix = "pinyin", analyzer = "pinyin_analyzer", searchAnalyzer = "pinyin_analyzer")
            }
    )
    private String effect;
}
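The tests below use a drugRepository that the listing does not show; a minimal Spring Data repository along these lines is assumed (package name hypothetical):
package com.boot.es.repository;

import com.boot.es.model.Drug;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;

public interface DrugRepository extends ElasticsearchRepository<Drug, Long> {
    // saveAll, findAll and search are inherited; nothing extra is needed here
}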
@Test
public void drugSaveTest() {
    Drug drug = new Drug();
    drug.setId(1L);
    drug.setName("阿莫西林胶囊");
    drug.setPrice("10");
    drug.setEffect("阿莫西林适用于敏感菌(不产β内酰胺酶菌株)所致的感染");

    Drug drug1 = new Drug();
    drug1.setId(3L);
    drug1.setName("阿莫西林");
    drug1.setPrice("10");
    drug1.setEffect("阿莫西林适用于敏感菌(不产β内酰胺酶菌株)所致的感染");

    Drug drug2 = new Drug();
    drug2.setId(2L);
    drug2.setName("999感冒灵颗粒");
    drug2.setPrice("20");
    drug2.setEffect("本品解热镇痛。用于感冒引起的头痛,发热,鼻塞,流涕,咽痛等");

    drugRepository.saveAll(Lists.newArrayList(drug, drug1, drug2));
    List<Drug> drugs = Lists.newArrayList(drugRepository.findAll());
    log.info("saved drugs: {}", drugs);
}
/**
 * In this test, name without a suffix is a Keyword field and is not analyzed,
 * so if it matches at all it is an exact match and should score higher; its
 * boost is therefore set to twice that of the match query on name.ik.
 */
@Test
public void drugIkSearchTest() {
    NativeSearchQueryBuilder builder = new NativeSearchQueryBuilder();
    NativeSearchQuery query = builder.withQuery(QueryBuilders.boolQuery()
            .should(QueryBuilders.matchQuery("name", "阿莫西林").boost(2))
            .should(QueryBuilders.matchQuery("name.ik", "阿莫西林").boost(1)))
            .build();
    log.info("DSL:{}", query.getQuery().toString());
    Iterable<Drug> iterable = drugRepository.search(query);
    List<Drug> drugs = Lists.newArrayList(iterable);
    log.info("result: {}", drugs);
}
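The logged DSL should come out roughly like this (abridged; default parameters omitted):
{
  "bool": {
    "should": [
      { "match": { "name": { "query": "阿莫西林", "boost": 2 } } },
      { "match": { "name.ik": { "query": "阿莫西林", "boost": 1 } } }
    ]
  }
}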
/**
 * In this test, name.pinyin only generates the joined full pinyin of the entire
 * name and the concatenated first letters of all its characters, so a match on
 * it is effectively an exact match and should score higher.
 */
@Test
public void drugPinyinSearchTest() {
    NativeSearchQueryBuilder builder = new NativeSearchQueryBuilder();
    NativeSearchQuery query = builder.withQuery(QueryBuilders.boolQuery()
            .should(QueryBuilders.matchQuery("name.ik_pinyin", "阿莫西林").boost(1))
            .should(QueryBuilders.matchQuery("name.pinyin", "阿莫西林").boost(2)))
            .withSort(SortBuilders.scoreSort())
            .build();
    log.info("DSL:{}", query.getQuery().toString());
    Iterable<Drug> iterable = drugRepository.search(query);
    List<Drug> drugs = Lists.newArrayList(iterable);
    log.info("result: {}", drugs);
}
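Because the pinyin sub-fields apply the same analyzer at search time, latin input matches as well; for example, a user typing only first letters (a sketch against the same repository):
NativeSearchQuery query = new NativeSearchQueryBuilder()
        .withQuery(QueryBuilders.matchQuery("name.pinyin", "amxl"))
        .build();
// "amxl" is the first-letter token generated for "阿莫西林", so drug1 should match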