中文前缀分词器
索引
GET /_analyze
{
  "tokenizer": {
    "type": "edge_ngram",
    "min_gram": 1,
    "max_gram": 50
  },
  "text": ["北京字节跳动"]
}
搜索
GET /_analyze
{
  "tokenizer": "keyword",
  "text": ["北京字节跳动"]
}
结果
# 索引
{
"tokens" : [
{
"token" : "北",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "北京",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "北京字",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "北京字节",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "北京字节跳",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "北京字节跳动",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 5
}
]
}
# 搜索
{
  "tokens" : [
    {
      "token" : "北京字节跳动",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "word",
      "position" : 0
    }
  ]
}
中文中缀分词器
索引
GET /_analyze
{
  "tokenizer": "standard",
  "filter": [
    "lowercase"
  ],
  "text": ["北京字节跳动"]
}
搜索
同索引(搜索时使用与索引相同的分词器)
结果
{
"tokens" : [
{
"token" : "北",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "京",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "字",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "节",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "跳",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "动",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
}
]
}
拼音全拼前缀分词器
索引
GET /_analyze
{
  "tokenizer": {
    "type": "edge_ngram",
    "min_gram": 1,
    "max_gram": 50
  },
  "filter": [
    {
      "type": "pinyin",
      "keep_original": false,
      "keep_first_letter": false,
      "keep_full_pinyin": false,
      "keep_joined_full_pinyin": true,
      "keep_none_chinese_together": true,
      "keep_none_chinese_in_joined_full_pinyin": true,
      "none_chinese_pinyin_tokenize": false,
      "keep_none_chinese": false,
      "ignore_pinyin_offset": false
    }
  ],
  "text": [
    "北京字节跳动"
  ]
}
搜索
GET /_analyze
{
  "tokenizer": "keyword",
  "filter": [
    {
      "type": "pinyin",
      "keep_original": false,
      "keep_first_letter": false,
      "keep_full_pinyin": false,
      "keep_joined_full_pinyin": true,
      "keep_none_chinese_together": true,
      "keep_none_chinese_in_joined_full_pinyin": true,
      "none_chinese_pinyin_tokenize": false,
      "keep_none_chinese": false,
      "ignore_pinyin_offset": false
    }
  ],
  "text": [
    "北京"
  ]
}
结果
# 索引
{
"tokens" : [
{
"token" : "bei",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "beijing",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "beijingzi",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "beijingzijie",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "beijingzijietiao",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "beijingzijietiaodong",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 5
}
]
}
# 搜索
{
"tokens" : [
{
"token" : "beijing",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 0
}
]
}
拼音全拼中缀分词器
索引
GET /_analyze
{
  "tokenizer": {
    "type": "pinyin",
    "keep_original": false,
    "keep_first_letter": false,
    "keep_full_pinyin": true,
    "none_chinese_pinyin_tokenize": false,
    "ignore_pinyin_offset": false
  },
  "text": [
    "北京字节跳动"
  ]
}
搜索
GET /_analyze
{
  "tokenizer": "keyword",
  "filter": [
    {
      "type": "pinyin",
      "keep_original": false,
      "keep_first_letter": false,
      "keep_full_pinyin": true,
      "none_chinese_pinyin_tokenize": false,
      "ignore_pinyin_offset": false
    }
  ],
  "text": [
    "北京"
  ]
}
结果
# 索引
{
"tokens" : [
{
"token" : "bei",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "jing",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "zi",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "jie",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "tiao",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "dong",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 5
}
]
}
# 搜索
{
"tokens" : [
{
"token" : "bei",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 0
},
{
"token" : "jing",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
}
]
}
拼音首字母前缀分词器
索引
GET /_analyze
{
  "tokenizer": {
    "type": "edge_ngram",
    "min_gram": 1,
    "max_gram": 50
  },
  "filter": [
    {
      "type": "pinyin",
      "keep_original": false,
      "keep_full_pinyin": false,
      "limit_first_letter_length": 50,
      "none_chinese_pinyin_tokenize": false,
      "keep_none_chinese": false,
      "ignore_pinyin_offset": false
    }
  ],
  "text": [
    "北京字节跳动"
  ]
}
搜索
GET /_analyze
{
  "tokenizer": "keyword",
  "filter": [
    {
      "type": "pinyin",
      "keep_original": false,
      "keep_full_pinyin": false,
      "limit_first_letter_length": 50,
      "none_chinese_pinyin_tokenize": false,
      "keep_none_chinese": false,
      "ignore_pinyin_offset": false
    }
  ],
  "text": [
    "北京"
  ]
}
结果
# 索引
{
"tokens" : [
{
"token" : "b",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "bj",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "bjz",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "bjzj",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "bjzjt",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "bjzjtd",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 5
}
]
}
# 搜索
{
"tokens" : [
{
"token" : "bj",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 0
}
]
}
拼音首字母中缀分词器
索引
GET /_analyze
{
  "tokenizer": {
    "type": "pinyin",
    "keep_original": false,
    "keep_separate_first_letter": true,
    "keep_first_letter": false,
    "keep_full_pinyin": false,
    "none_chinese_pinyin_tokenize": false,
    "ignore_pinyin_offset": false
  },
  "text": [
    "北京字节跳动"
  ]
}
搜索
GET /_analyze
{
  "tokenizer": "keyword",
  "filter": [
    {
      "type": "pinyin",
      "keep_original": false,
      "keep_separate_first_letter": true,
      "keep_first_letter": false,
      "keep_full_pinyin": false,
      "none_chinese_pinyin_tokenize": false,
      "ignore_pinyin_offset": false
    }
  ],
  "text": [
    "北京"
  ]
}
结果
# 索引
{
"tokens" : [
{
"token" : "b",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "j",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "z",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "j",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "t",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "d",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 5
}
]
}
# 搜索
{
"tokens" : [
{
"token" : "b",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 0
},
{
"token" : "j",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
}
]
}
实战
建模
PUT /ourea-home-suggestion-v15
{
  "settings": {
    "analysis": {
      "analyzer": {
        "lowercase_standard": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        },
        "prefix_index_analyzer": {
          "tokenizer": "edge_ngram_tokenizer"
        },
        "full_pinyin_index_analyzer": {
          "tokenizer": "full_pinyin_tokenizer"
        },
        "full_pinyin_prefix_index_analyzer": {
          "tokenizer": "edge_ngram_tokenizer",
          "filter": [
            "full_pinyin_prefix_filter"
          ]
        },
        "first_letter_prefix_index_analyzer": {
          "tokenizer": "edge_ngram_tokenizer",
          "filter": [
            "first_letter_prefix_filter"
          ]
        },
        "first_letter_index_analyzer": {
          "tokenizer": "first_letter_tokenizer"
        },
        "full_pinyin_search_analyzer": {
          "tokenizer": "keyword",
          "filter": [
            "full_pinyin_filter"
          ]
        },
        "full_pinyin_prefix_search_analyzer": {
          "tokenizer": "keyword",
          "filter": [
            "full_pinyin_prefix_filter"
          ]
        },
        "first_letter_prefix_search_analyzer": {
          "tokenizer": "keyword",
          "filter": [
            "first_letter_prefix_filter"
          ]
        },
        "first_letter_search_analyzer": {
          "tokenizer": "keyword",
          "filter": [
            "first_letter_filter"
          ]
        }
      },
      "tokenizer": {
        "edge_ngram_tokenizer": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 50
        },
        "full_pinyin_tokenizer": {
          "type": "pinyin",
          "keep_original": false,
          "keep_first_letter": false,
          "keep_full_pinyin": true,
          "none_chinese_pinyin_tokenize": false,
          "ignore_pinyin_offset": false
        },
        "first_letter_tokenizer": {
          "type": "pinyin",
          "keep_original": false,
          "keep_separate_first_letter": true,
          "keep_first_letter": false,
          "keep_full_pinyin": false,
          "none_chinese_pinyin_tokenize": false,
          "ignore_pinyin_offset": false
        }
      },
      "filter": {
        "full_pinyin_filter": {
          "type": "pinyin",
          "keep_original": false,
          "keep_first_letter": false,
          "keep_full_pinyin": true,
          "none_chinese_pinyin_tokenize": false,
          "ignore_pinyin_offset": false
        },
        "full_pinyin_prefix_filter": {
          "type": "pinyin",
          "keep_original": false,
          "keep_first_letter": false,
          "keep_full_pinyin": false,
          "keep_joined_full_pinyin": true,
          "keep_none_chinese_together": true,
          "keep_none_chinese_in_joined_full_pinyin": true,
          "none_chinese_pinyin_tokenize": false,
          "keep_none_chinese": false,
          "ignore_pinyin_offset": false
        },
        "edge_ngram_filter": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 50
        },
        "first_letter_filter": {
          "type": "pinyin",
          "keep_original": false,
          "keep_separate_first_letter": true,
          "keep_first_letter": false,
          "keep_full_pinyin": false,
          "none_chinese_pinyin_tokenize": false,
          "ignore_pinyin_offset": false
        },
        "first_letter_prefix_filter": {
          "type": "pinyin",
          "keep_original": false,
          "keep_full_pinyin": false,
          "limit_first_letter_length": 50,
          "none_chinese_pinyin_tokenize": false,
          "keep_none_chinese": false,
          "ignore_pinyin_offset": false
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "keyword",
        "fields": {
          "standard": {
            "type": "text",
            "analyzer": "lowercase_standard"
          },
          "prefix": {
            "type": "text",
            "analyzer": "prefix_index_analyzer"
          },
          "full_pinyin": {
            "type": "text",
            "analyzer": "full_pinyin_index_analyzer",
            "search_analyzer": "full_pinyin_search_analyzer",
            "fields": {
              "prefix": {
                "type": "text",
                "analyzer": "full_pinyin_prefix_index_analyzer",
                "search_analyzer": "full_pinyin_prefix_search_analyzer"
              }
            }
          },
          "first_letter": {
            "type": "text",
            "analyzer": "first_letter_index_analyzer",
            "search_analyzer": "first_letter_search_analyzer",
            "fields": {
              "prefix": {
                "type": "text",
                "analyzer": "first_letter_prefix_index_analyzer",
                "search_analyzer": "first_letter_prefix_search_analyzer"
              }
            }
          }
        }
      },
      "status": {
        "type": "short"
      },
      "type": {
        "type": "short"
      },
      "top": {
        "type": "short"
      },
      "onlined": {
        "type": "short"
      },
      "sequence": {
        "type": "double"
      }
    }
  }
}
DSL
GET /ourea-home-suggestion/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "onlined": {
              "value": 1,
              "boost": 1
            }
          }
        }
      ],
      "should": [
        {
          "term": {
            "name.prefix": {
              "value": "C++软件",
              "boost": 10
            }
          }
        },
        {
          "match_phrase": {
            "name.standard": {
              "query": "C++软件",
              "slop": 0,
              "zero_terms_query": "NONE",
              "boost": 5
            }
          }
        },
        {
          "bool": {
            "filter": [
              {
                "match_phrase_prefix": {
                  "name.full_pinyin.prefix": {
                    "query": "C++软件",
                    "analyzer": "full_pinyin_prefix_search_analyzer",
                    "slop": 0,
                    "max_expansions": 100,
                    "zero_terms_query": "NONE",
                    "boost": 1
                  }
                }
              }
            ],
            "should": [
              {
                "match_phrase_prefix": {
                  "name.full_pinyin": {
                    "query": "C++软件",
                    "analyzer": "full_pinyin_search_analyzer",
                    "slop": 0,
                    "max_expansions": 50,
                    "zero_terms_query": "NONE",
                    "boost": 1
                  }
                }
              }
            ],
            "adjust_pure_negative": true,
            "minimum_should_match": "1",
            "boost": 3
          }
        },
        {
          "match_phrase_prefix": {
            "name.full_pinyin": {
              "query": "C++软件",
              "analyzer": "full_pinyin_search_analyzer",
              "slop": 0,
              "max_expansions": 50,
              "zero_terms_query": "NONE",
              "boost": 1.5
            }
          }
        },
        {
          "match": {
            "name.first_letter.prefix": {
              "query": "C++软件",
              "operator": "OR",
              "analyzer": "first_letter_prefix_search_analyzer",
              "prefix_length": 0,
              "max_expansions": 100,
              "fuzzy_transpositions": true,
              "lenient": false,
              "zero_terms_query": "NONE",
              "auto_generate_synonyms_phrase_query": true,
              "boost": 1
            }
          }
        },
        {
          "match_phrase": {
            "name.first_letter": {
              "query": "C++软件",
              "analyzer": "first_letter_search_analyzer",
              "slop": 0,
              "zero_terms_query": "NONE",
              "boost": 0.8
            }
          }
        }
      ],
      "adjust_pure_negative": true,
      "minimum_should_match": "1",
      "boost": 1
    }
  },
  "highlight": {
    "type": "plain",
    "fields": {
      "name.prefix": {},
      "name.standard": {},
      "name.full_pinyin": {},
      "name.first_letter.prefix": {},
      "name.first_letter": {}
    }
  }
}