在使用 Elasticsearch 的过程中,难免会遇到对分词字段进行排序的需求。下面说明一下,主要包括排序分类和示例。
前提 1、要了解单字段多类型(multi-fields) https://www.elastic.co/guide/en/elasticsearch/reference/7.11/multi-fields.html
2、时间或者 Long 等类型不讨论 3、text 类型的分词字段如果要排序,需要在 mapping 中设置 "fielddata": true 4、请自行了解 ES 官网中 sort 排序属性的设置 https://www.elastic.co/guide/en/elasticsearch/reference/7.11/sort-search-results.html
字段排序分类
一、默认排序
不指定排序字段(即没有设置 sort)且不涉及打分时(例如只使用 filter,所有文档的 _score 相同),结果大致按照数据写入的先后顺序(即索引内部的文档顺序)返回,但 ES 并不保证这一顺序
mapping中字段设置
{
"name": {
"type": "keyword",
"fields": {
"ik_max_word_false": {
"analyzer": "ik_max_word",
"type": "text"
},
"ik_max_word_true": {
"fielddata": true,
"analyzer": "ik_max_word",
"type": "text"
}
}
}
}
插入数据指定 _id,先插入_id 为10 ,后插入_id 为7的数据
PUT /test_order/_doc/10
{"name":"张二"}
PUT /test_order/_doc/7
{"name":"张二"}
查询语句
POST /test_order/_search
{
"_source": "name",
"query": {
"bool": {
"filter": {
"term": {
"name.ik_max_word_true": "张"
}
}
}
}
}
查询部分结果
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "10",
"_score" : 0.0,
"_source" : {
"name" : "张二"
}
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "7",
"_score" : 0.0,
"_source" : {
"name" : "张二"
}
}
二、评分排序
DSL 中没有设置 sort、但查询涉及打分时,结果按照评分(_score)倒序排列
插入数据
PUT /test_order/_doc/11
{"name":"周正国"}
PUT /test_order/_doc/12
{"name":"周小国"}
PUT /test_order/_doc/13
{"name":"周国"}
查询数据
POST /test_order/_search
{
"_source": "name",
"query": {
"term": {
"name.ik_max_word_true": "周"
}
}
}
部分结果
"hits" : [
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "13",
"_score" : 1.605183,
"_source" : {
"name" : "周国"
}
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "11",
"_score" : 1.2199391,
"_source" : {
"name" : "周正国"
}
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "12",
"_score" : 1.2199391,
"_source" : {
"name" : "周小国"
}
}
]
三、指定排序
DSL 中指定 sort 的查询。指定 sort 后默认不再计算评分(结果中 _score 为 null);如果仍需要评分,可以在请求中设置 "track_scores": true
1、按照中文排序
利用ICU分词器插件,参考官网说明 https://www.elastic.co/guide/en/elasticsearch/plugins/7.11/analysis-icu-collation-keyword-field.html
字段 设置
{
"properties": {
"name": {
"type": "keyword",
"fields": {
"icu_keyword": {
"type": "icu_collation_keyword",
"index": false,
"language": "zh",
"country": "CN"
}
}
}
}
}
查询
GET /test_order/_search
{
"_source": "name",
"query": {
"match_all": {}
},"sort": [
{
"name.icu_keyword": {
"order": "desc","mode": "min","missing": "_last"
}
}
]
}
部分结果
"hits" : [
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "1",
"_score" : null,
"_source" : {
"name" : "周正国"
},
"sort" : [
"""偁某沕‐㠈ᰀ"""
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "2",
"_score" : null,
"_source" : {
"name" : "王小国"
},
"sort" : [
"""䛕撏ⲕ‐㠈ᰀ"""
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "3",
"_score" : null,
"_source" : {
"name" : "蔡大国"
},
"sort" : [
"""⪣ᚧಕ‐㠈ᰀ"""
]
}
]
结果中可以看到 sort 的值显示为乱码:icu_collation_keyword 字段保存的是 ICU 排序键(collation key)的二进制编码,而不是原始文本,因此无法直接阅读。不建议采用这种方式,因为在某些情况下无法通过 sort 值确认顺序是否正确(对于中文排序,建议按照下面的字母顺序方式排序)
2、按照字母(A-Z)顺序排序
利用 pinyin 分词器插件进行分词,拼音分词器中各属性的设置可参考
大佬博文 https://www.cnblogs.com/wulisz/p/15010558.html
实现方式一
主要就是利用保留首字母的设置 "keep_first_letter": "true"(其他选项均为 false)配合 keyword 分词器,使得 name 的 pinyin_keyword 子字段的分词结果只有一个由各字首字母组成的词项。这样查询时按该子字段排序,就实现了按字母顺序排序。
为什么必须使用 keyword 分词器,而不是 ik_smart 或者 ik_max_word?
在自定义的 analyzer 中,拼音转换是通过 filter 实现的,也就是说先由 tokenizer 分词,再对分词后的每个词语执行 filter,转换成拼音字符。如果使用 ik_max_word 分词,就会出现乱序的情况。比如 name 为 李爱国,经过 ik_max_word 分词后会得到 李、爱、国 三个词,再取首字母得到 l、a、g;排序时 李爱国 就会按照 爱 的首字母 a 来排,而不是 l。这样就不是按 name 首字符的首字母排序了,也就出现了所谓的乱序。
setting里设置分词器
PUT /test_order
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"pinyin_analyzer_keyword": {
"tokenizer": "keyword",
"filter": "my_pinyin"
},
"pinyin_analyzer_ik_smart": {
"tokenizer": "ik_smart",
"filter": "my_pinyin"
},
"pinyin_analyzer_ik_max": {
"tokenizer": "ik_max_word",
"filter": "my_pinyin"
}
},
"filter": {
"my_pinyin": {
"type": "pinyin",
"keep_first_letter": "true",
"keep_separate_first_letter": "false",
"keep_full_pinyin": "false",
"keep_joined_full_pinyin": "false",
"keep_none_chinese": "false",
"keep_none_chinese_together": "false",
"keep_none_chinese_in_first_letter": "false",
"keep_none_chinese_in_joined_full_pinyin": "false",
"none_chinese_pinyin_tokenize": "false",
"keep_original": "false",
"lowercase": "false",
"trim_whitespace": "false",
"remove_duplicated_term": "false"
}
}
}
}
}
}
mapping 设置字段
PUT /test_order/_mapping
{
"properties": {
"name": {
"type": "text",
"fields": {
"pinyin_ik_smart": {
"type": "text",
"analyzer": "pinyin_analyzer_ik_smart",
"fielddata": true
},
"pinyin_ik_max": {
"type": "text",
"analyzer": "pinyin_analyzer_ik_max",
"fielddata": true
},
"pinyin_keyword": {
"type": "text",
"analyzer": "pinyin_analyzer_keyword",
"fielddata": true
}
}
}
}
}
查看分词结果
POST /test_order/_analyze
{
"analyzer": "pinyin_analyzer_keyword",
"text": ["蔡蒸下"]
}
结果
{
"tokens" : [
{
"token" : "czx",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 0
}
]
}
查询语句
GET /test_order/_search
{
"_source": "name",
"sort": [
{
"name.pinyin_keyword": {
"order": "asc","mode": "min","missing": "_last"
}
}
]
}
查询部分结果
"hits" : [
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "33",
"_score" : null,
"_source" : {
"name" : "爱小国"
},
"sort" : [
"axg"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "34",
"_score" : null,
"_source" : {
"name" : "包不同"
},
"sort" : [
"bbt"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "11",
"_score" : null,
"_source" : {
"name" : "蔡蒸下"
},
"sort" : [
"czx"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "44",
"_score" : null,
"_source" : {
"name" : "走国"
},
"sort" : [
"zg"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "22",
"_score" : null,
"_source" : {
"name" : "周正国"
},
"sort" : [
"zzg"
]
}
]
实现方式二 直接设置分词器
setting设置
"analysis": {
"analyzer": {
"pinyin_analyzer": {
"tokenizer": "pinyin_tokenizer"
}
},
"tokenizer": {
"pinyin_tokenizer": {
"lowercase": "false",
"keep_original": "false",
"remove_duplicated_term": "false",
"keep_separate_first_letter": "false",
"keep_first_letter": "true",
"type": "pinyin",
"limit_first_letter_length": "16",
"keep_full_pinyin": "false"
}
}
}
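mapping设置(注意:text 字段的 mapping 并不支持 tokenizer 参数,分词方式完全由 analyzer 指定的 pinyin_analyzer 决定;下面示例中的 "tokenizer": "keyword" 一行在实际创建 mapping 时应去掉,否则可能因参数不受支持而报错)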
"name": {
"type": "text",
"fields": {
"pinyin": {
"type": "text",
"analyzer": "pinyin_analyzer",
"tokenizer": "keyword",
"fielddata": true
}
}
}
查看分词结果
POST /test_order/_analyze
{
"analyzer": "pinyin_analyzer",
"text": ["蔡蒸下"]
}
结果
{
"tokens" : [
{
"token" : "czx",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 0
}
]
}
查询
GET /test_order/_search
{
"_source": "name",
"sort": [
{
"name.pinyin": {
"order": "asc","mode": "min","missing": "_last"
}
}
]
}
查询结果
"hits" : [
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "33",
"_score" : null,
"_source" : {
"name" : "爱小国"
},
"sort" : [
"axg"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "34",
"_score" : null,
"_source" : {
"name" : "包不同"
},
"sort" : [
"bbt"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "11",
"_score" : null,
"_source" : {
"name" : "蔡蒸下"
},
"sort" : [
"czx"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "44",
"_score" : null,
"_source" : {
"name" : "走国"
},
"sort" : [
"zg"
]
},
{
"_index" : "test_order",
"_type" : "_doc",
"_id" : "22",
"_score" : null,
"_source" : {
"name" : "周正国"
},
"sort" : [
"zzg"
]
}
]
四、二次打分排序
利用 ES 中的 rescore 进行二次打分。实际当中可能会有这样的需求:按照多个关键词对字段 A 进行全文检索,结果按照评分(匹配度)从高到低显示;但是还有一个时间字段 B,需要按照时间倒序。如果直接按照时间倒序,可能会出现时间最新的数据其字段 A 的匹配度却很低的情况。整体需求就是:在高匹配度的基础上按照时间倒序,也就是二次打分。
直接参考官网示例 https://www.elastic.co/guide/en/elasticsearch/reference/7.11/filter-search-results.html#rescore
还有一种方式就是利用function_score 去调整打分
https://www.knowledgedict.com/tutorial/elasticsearch-function_score.html