DSL

最新推荐文章于 2024-06-23 14:58:54 发布

阿花落知多少

最新推荐文章于 2024-06-23 14:58:54 发布

阅读量322

点赞数

本文链接：https://blog.csdn.net/qq_39940205/article/details/105629476

版权

# Create an index with one primary shard and zero replica shards.
# The shard/replica counts are integers, so they are sent as JSON numbers
# rather than quoted strings.
# NOTE(review): every later request in this file targets the forum index,
# which this request does not create - confirm that foodie-items is intended.
PUT /foodie-items
{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0
    }
  }
}
# List the indices in the cluster
GET /_cat/indices?v
# Search the documents stored under index forum, type article
GET /forum/article/_search
# Show the mapping of index forum, type article
GET /forum/_mapping/article
# Fetch a single document by its _id
GET /forum/article/1
# Show the tokens produced for a sample value when it is analyzed
# the same way as the articleID field
GET /forum/_analyze
{
  "field": "articleID",
  "text": "XHDK-A-1293-#fJ3"
}
# _bulk batches create/update/delete operations; mget is the batch-read counterpart.
# Index four sample posts. Each action line is followed by the source of the
# document it indexes, one JSON object per line.
POST /forum/article/_bulk
{ "index": { "_id": 1 }}
{ "articleID" : "XHDK-A-1293-#fJ3", "userID" : 1, "hidden": false, "postDate": "2017-01-01" }
{ "index": { "_id": 2 }}
{ "articleID" : "KDKE-B-9947-#kL5", "userID" : 1, "hidden": false, "postDate": "2017-01-02" }
{ "index": { "_id": 3 }}
{ "articleID" : "JODL-X-1937-#pV7", "userID" : 2, "hidden": false, "postDate": "2017-01-01" }
{ "index": { "_id": 4 }}
{ "articleID" : "QQPX-R-3956-#aD8", "userID" : 2, "hidden": true, "postDate": "2017-01-02" }


# A string field mapped as text gets two fields by default: the field itself
# (for example articleID), which is analyzed, and field.keyword, which is not
# analyzed and keeps at most 256 characters.
# This term filter runs against the analyzed articleID field and returns no hits.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "term": { "articleID": "KDKE-B-9947-#kL5" }
      }
    }
  }
}
# The same term filter against the un-analyzed articleID.keyword field
# returns exactly one hit.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "term": { "articleID.keyword": "KDKE-B-9947-#kL5" }
      }
    }
  }
}

# Find posts whose postDate is 2017-01-01 or whose articleID is XHDK-A-1293-#fJ3,
# and additionally require that postDate is not 2017-01-02. SQL equivalent:
# select * from forum.article where (postDate = '2017-01-01' or articleID = 'XHDK-A-1293-#fJ3') and postDate != '2017-01-02';
# The articleID clause targets articleID.keyword: a term filter on the analyzed
# articleID field cannot match the full ID, as the empty-result example earlier
# in this file shows.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "should": [
            { "term": { "postDate": "2017-01-01" } },
            { "term": { "articleID.keyword": "XHDK-A-1293-#fJ3" } }
          ],
          "must_not": {
            "term": { "postDate": "2017-01-02" }
          }
        }
      }
    }
  }
}


# Find posts whose articleID is XHDK-A-1293-#fJ3, or whose articleID is
# JODL-X-1937-#pV7 and whose postDate is 2017-01-01. SQL equivalent:
# select * from forum.article where articleID = 'XHDK-A-1293-#fJ3' or (articleID = 'JODL-X-1937-#pV7' and postDate = '2017-01-01')
# The "and" group is expressed as a bool/must nested inside the outer bool/should.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "should": [
            { "term": { "articleID.keyword": "XHDK-A-1293-#fJ3" } },
            {
              "bool": {
                "must": [
                  { "term": { "articleID.keyword": "JODL-X-1937-#pV7" } },
                  { "term": { "postDate": "2017-01-01" } }
                ]
              }
            }
          ]
        }
      }
    }
  }
}

# term: {"field": "value"}
# terms: {"field": ["value1", "value2"]}

# terms is comparable to IN in SQL:
# select * from tbl where col in ("value1", "value2")

# Add a tag field to posts 1-4 with partial updates.
# Each update action line is followed by the partial document to merge in.
POST /forum/article/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"tag" : ["java", "hadoop"]} }
{ "update": { "_id": "2"} }
{ "doc" : {"tag" : ["java"]} }
{ "update": { "_id": "3"} }
{ "doc" : {"tag" : ["hadoop"]} }
{ "update": { "_id": "4"} }
{ "doc" : {"tag" : ["java", "elasticsearch"]} }

# Find posts whose articleID is KDKE-B-9947-#kL5 or QQPX-R-3956-#aD8.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "terms": {
          "articleID.keyword": ["KDKE-B-9947-#kL5", "QQPX-R-3956-#aD8"]
        }
      }
    }
  }
}

# Find posts whose tag list contains java.
# NOTE(review): the original heading said "java or hadoop", but the terms list
# holds only java - confirm which was intended.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "terms": { "tag": ["java"] }
      }
    }
  }
}

# Refine the search so that it returns only posts whose tag list is java and nothing else.
# To make that possible, store the number of tags of each post in a tag_cnt field.
POST /forum/article/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"tag_cnt" : 2} }
{ "update": { "_id": "2"} }
{ "doc" : {"tag_cnt" : 1} }
{ "update": { "_id": "3"} }
{ "doc" : {"tag_cnt" : 1} }
{ "update": { "_id": "4"} }
{ "doc" : {"tag_cnt" : 2} }

# Posts that have exactly one tag (tag_cnt is 1) and whose tag list contains java.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "tag_cnt": 1
              }
            },
            {
              "terms": {
                "tag": [
                  "java"
                ]
              }
            }
          ]
        }
      }
    }
  }
}

# Add a view-count field (view_cnt) to posts 1-4 with partial updates.
POST /forum/article/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"view_cnt" : 30} }
{ "update": { "_id": "2"} }
{ "doc" : {"view_cnt" : 50} }
{ "update": { "_id": "3"} }
{ "doc" : {"view_cnt" : 100} }
{ "update": { "_id": "4"} }
{ "doc" : {"view_cnt" : 80} }

# Find posts whose view count is between 30 and 60, both bounds inclusive.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "view_cnt": { "gte": 30, "lte": 60 }
        }
      }
    }
  }
}

# Find posts published within the last month.
# First index a fifth post whose postDate (2020-04-20) is later than the earlier samples.
# NOTE(review): the date is fixed, so whether the range query that follows
# matches it depends on when the requests are run.
POST /forum/article/_bulk
{ "index": { "_id": 5 }}
{ "articleID" : "DHJK-B-1395-#Ky5", "userID" : 3, "hidden": false, "postDate": "2020-04-20", "tag": ["elasticsearch"], "tag_cnt": 1, "view_cnt": 10 }

# Range filter on postDate: keep posts dated no earlier than 30 days before now.
GET /forum/article/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "range": {
          "postDate": { "gte": "now-30d" }
        }
      }
    }
  }
}

# Add a title field to posts 1-5 with partial updates.
POST /forum/article/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"title" : "this is java and elasticsearch blog"} }
{ "update": { "_id": "2"} }
{ "doc" : {"title" : "this is java blog"} }
{ "update": { "_id": "3"} }
{ "doc" : {"title" : "this is elasticsearch blog"} }
{ "update": { "_id": "4"} }
{ "doc" : {"title" : "this is java, elasticsearch, hadoop blog"} }
{ "update": { "_id": "5"} }
{ "doc" : {"title" : "this is spark blog"} }

# Find blogs whose title contains java or elasticsearch.
# Unlike the term queries above, this does not look for an exact value;
# it performs full-text search.
# The match query is the full-text query. When the searched field is
# not_analyzed, a match query behaves like a term query.
GET /forum/article/_search
{
  "query": {
    "match": { "title": "java elasticsearch" }
  }
}

# Find blogs whose title contains both java and elasticsearch.
# The and operator requires every search keyword to match.
GET /forum/article/_search
{
  "query": {
    "match": {
      "title": { "query": "java elasticsearch", "operator": "and" }
    }
  }
}

# Find blogs that contain at least 3 of the 4 keywords java, elasticsearch,
# spark and hadoop.
# minimum_should_match states how many of the given keywords must match
# before a document is returned.
GET /forum/article/_search
{
  "query": {
    "match": {
      "title": { "query": "java elasticsearch spark hadoop", "minimum_should_match": "75%" }
    }
  }
}
# The same requirement written as four explicit bool/should clauses,
# of which at least 3 must match.
GET /forum/article/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "title": "java"
          }
        },
        {
          "match": {
            "title": "elasticsearch"
          }
        },
        {
          "match": {
            "title": "hadoop"
          }
        },
        {
          "match": {
            "title": "spark"
          }
        }
      ],
      "minimum_should_match": 3
    }
  }
}
# 1. Full-text search over several values can be written two ways: a match query, or bool/should clauses
# 2. Result precision is controlled with the and operator or with minimum_should_match
# When a match query searches several values, Elasticsearch converts it internally into the bool syntax

# boost raises the weight of one search clause. When some documents match the
# boosted clause and others match a different clause, the documents matching
# the more heavily weighted clause get a higher relevance score and are
# therefore returned first.
# By default every search clause has a weight of 1.
GET /forum/article/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "title": "blog" } }
      ],
      "should": [
        { "match": { "title": { "query": "java" } } },
        { "match": { "title": { "query": "elasticsearch", "boost": 2 } } }
      ]
    }
  }
}

# Multi-field search
# Add a content field to posts 1-5 with partial updates.
POST /forum/article/_bulk
{ "update": { "_id": "1"} }
{ "doc" : {"content" : "i like to write best elasticsearch article"} }
{ "update": { "_id": "2"} }
{ "doc" : {"content" : "i think java is the best programming language"} }
{ "update": { "_id": "3"} }
{ "doc" : {"content" : "i am only an elasticsearch beginner"} }
{ "update": { "_id": "4"} }
{ "doc" : {"content" : "elasticsearch and hadoop are all very good solution, i am a beginner"} }
{ "update": { "_id": "5"} }
{ "doc" : {"content" : "spark is best big data solution based on scala ,an programming language similar to java"} }

# Find posts whose title or content contains java or solution.
GET /forum/article/_search
{
  "query": {
    "bool": {
      "should": [
        { "match": { "title": "java solution" } },
        { "match": { "content": "java solution" } }
      ]
    }
  }
}

# Best-fields strategy: a document in which a single field matches as many of
# the keywords as possible should rank first, rather than a document in which
# many fields each match only a few keywords.
# dis_max takes the score of the highest-scoring query among its queries.
GET /forum/article/_search
{
  "query": {
    "dis_max": {
      "queries": [
        { "match": { "title": "java solution" } },
        { "match": { "content": "java solution" } }
      ]
    }
  }
}

03_结构化搜索_filter执行原理深度剖析（bitset机制与caching机制）

（1）在倒排索引中查找搜索串，获取document list

（2）为每个在倒排索引中搜索到的结果，构建一个bitset，[0, 0, 0, 1, 0, 1]

（3）遍历每个过滤条件对应的bitset，优先从最稀疏的开始搜索，查找满足所有条件的document

（4）caching bitset，跟踪query，在最近256个query中超过一定次数的过滤条件，缓存其bitset。对于小segment（文档数<10000，或文档数占比<3%），不缓存bitset。

（5）filter大部分情况下来说，在query之前执行，先尽量过滤掉尽可能多的数据

（6）如果document有新增或修改，那么cached bitset会被自动更新

（7）以后只要是有相同的filter条件的，会直接来使用这个过滤条件对应的cached bitset