之前已经分别单独介绍了中文分词器(IK)和拼音分词器的使用方式,本节重点介绍将中文分词器和拼音分词器结合使用的方式,即针对同一个字段,既可以用中文分词检索,也可以用拼音检索。
废话不多说,直接上配置
PUT /pinyin-ik-test/
{
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "ik_max_word",
"filter":"my_filter"
}
},
"filter":{
"my_filter":{
"type": "pinyin",
"keep_separate_first_letter": false,
"keep_full_pinyin": true,
"keep_original": true,
"limit_first_letter_length": 16,
"lowercase": true,
"remove_duplicated_term": true
}
}
}
}
}
配置说明
analyzer 的详细配置这里不做介绍,重点使用的三个配置项:type、tokenizer、filter
实现中文分词和拼音结合的思路:先由 IK tokenizer 分词,得到的结果再经过 pinyin 的 filter 处理,得到各个分词对应的拼音,效果如下:
GET /pinyin-ik-test/_analyze
{
"text": ["刘德华是著名的歌星、影星、慈善家、明星"],
"analyzer": "my_analyzer"
}
{
"tokens": [
{
"token": "liu",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "刘德华",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "ldh",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "de",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
},
{
"token": "hua",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 2
},
{
"token": "shi",
"start_offset": 3,
"end_offset": 4,
"type": "CN_CHAR",
"position": 3
},
{
"token": "是",
"start_offset": 3,
"end_offset": 4,
"type": "CN_CHAR",
"position": 3
},
{
"token": "s",
"start_offset": 3,
"end_offset": 4,
"type": "CN_CHAR",
"position": 3
},
{
"token": "zhu",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 4
},
{
"token": "ming",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 5
},
{
"token": "著名",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 5
},
{
"token": "zm",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 5
},
{
"token": "de",
"start_offset": 6,
"end_offset": 7,
"type": "CN_CHAR",
"position": 6
},
{
"token": "的",
"start_offset": 6,
"end_offset": 7,
"type": "CN_CHAR",
"position": 6
},
{
"token": "d",
"start_offset": 6,
"end_offset": 7,
"type": "CN_CHAR",
"position": 6
},
{
"token": "ge",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 7
},
{
"token": "xing",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 8
},
{
"token": "歌星",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 8
},
{
"token": "gx",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 8
},
{
"token": "ying",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 9
},
{
"token": "xing",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 10
},
{
"token": "影星",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 10
},
{
"token": "yx",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 10
},
{
"token": "ci",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 11
},
{
"token": "shan",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 12
},
{
"token": "jia",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 13
},
{
"token": "慈善家",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 13
},
{
"token": "csj",
"start_offset": 13,
"end_offset": 16,
"type": "CN_WORD",
"position": 13
},
{
"token": "ci",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 14
},
{
"token": "shan",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 15
},
{
"token": "慈善",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 15
},
{
"token": "cs",
"start_offset": 13,
"end_offset": 15,
"type": "CN_WORD",
"position": 15
},
{
"token": "jia",
"start_offset": 15,
"end_offset": 16,
"type": "CN_CHAR",
"position": 16
},
{
"token": "家",
"start_offset": 15,
"end_offset": 16,
"type": "CN_CHAR",
"position": 16
},
{
"token": "j",
"start_offset": 15,
"end_offset": 16,
"type": "CN_CHAR",
"position": 16
},
{
"token": "ming",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 17
},
{
"token": "xing",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 18
},
{
"token": "明星",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 18
},
{
"token": "mx",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 18
}
]
}
配置 index 的 mapping(为 content 字段指定 my_analyzer)
POST /pinyin-ik-test/DOC/_mapping
{
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer"
}
}
}
效果测试
#写入数据
POST /pinyin-ik-test/DOC/
{
"content":"刘德华是著名的明星"
}
POST /pinyin-ik-test/DOC/
{
"content":"刘德华是著名的影星"
}
POST /pinyin-ik-test/DOC/
{
"content":"刘德华是著名的慈善家"
}
POST /pinyin-ik-test/DOC/
{
"content":"刘德华是著名的明星"
}
POST /pinyin-ik-test/DOC/
{
"content":"刘德华是著名的人"
}
#用中文 明星 检索
POST /pinyin-ik-test/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "明星"
}
}
]
}
}
}
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.36661926,
"hits": [
{
"_index": "pinyin-ik-test",
"_type": "DOC",
"_id": "gl_u23IBeaMPz9g6tTIh",
"_score": 0.36661926,
"_source": {
"content": "刘德华是著名的明星"
}
}
]
}
}
#用拼音 mingxing 检索,无任何结果,是因为没有把 keep_joined_full_pinyin 选项设置为true,默认是false
POST /pinyin-ik-test/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "mingxing"
}
}
]
}
}
}
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
#用拼音首字母 mx 检索
POST /pinyin-ik-test/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "mx"
}
}
]
}
}
}
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.36661926,
"hits": [
{
"_index": "pinyin-ik-test",
"_type": "DOC",
"_id": "gl_u23IBeaMPz9g6tTIh",
"_score": 0.36661926,
"_source": {
"content": "刘德华是著名的明星"
}
}
]
}
}
修改配置
回顾拼音分词器中给出的配置项列表,可以看到 keep_joined_full_pinyin 选项控制是否保留连写的全拼(例如 mingxing),但默认是 false,即关闭的。
将keep_joined_full_pinyin 设置为true,再验证一下用 mingxing 拼音检索
#新建index pinyin-ik-test-3(不能在原有index基础上修改),并在 my_filter 中将 keep_joined_full_pinyin 设置为 true
PUT /pinyin-ik-test-3/
{
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "ik_max_word",
"filter":"my_filter"
}
},
"filter":{
"my_filter":{
"type": "pinyin",
"keep_separate_first_letter": false,
"keep_full_pinyin": true,
"keep_original": true,
"limit_first_letter_length": 16,
"lowercase": true,
"remove_duplicated_term": true,
"keep_joined_full_pinyin": true
}
}
}
}
}
#配置content字段
POST /pinyin-ik-test-3/DOC/_mapping
{
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer"
}
}
}
#写入数据
POST /pinyin-ik-test-3/DOC/
{
"content":"刘德华是著名的明星"
}
#测试
POST /pinyin-ik-test-3/DOC/_search
{
"query": {
"bool": {
"must": [
{
"term": {
"content": "mingxing"
}
}
]
}
}
}
{
"took": 9,
"timed_out": false,
"_shards": {
"total": 3,
"successful": 3,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.37939543,
"hits": [
{
"_index": "pinyin-ik-test-3",
"_type": "DOC",
"_id": "hl_823IBeaMPz9g63jJ1",
"_score": 0.37939543,
"_source": {
"content": "刘德华是著名的明星"
}
}
]
}
}