Elasticsearch学习分享（六）

CSAIWQYB

已于 2022-11-15 23:18:43 修改

阅读量1.8k

点赞数 1

分类专栏：数据工程 文章标签：ELK Stack 大数据 全文检索 自然语言处理

于 2020-07-03 17:26:15 首次发布

本文链接：https://blog.csdn.net/weixin_44526949/article/details/106798656

版权

数据工程专栏收录该内容

21 篇文章 0 订阅

订阅专栏

本期学习ES DSL及其基本用法。

什么是ES DSL呢？全称Elasticsearch Query DSL。DSL又是什么呢？DSL(Domain Specific Language)，英译中的结果就是，领域特定语言。DSL指的是专注于某个应用程序领域的计算机语言，又译作领域专用语言。不同于其他计算机语言，顾名思义，这种语言只用在某些特定的领域。ES DSL是专门属于ES的查询语言，elasticsearch提供标准Restful风格的查询DSL来定义查询。可以将查询DSL看作由两种子句组成的查询的AST(Abstract Syntax Tree)：一种是lqc(leaf query clauses，叶查询语句)，可以理解为SQL里的where查询，在特定的字段中查找特定值，例如match，term或range查询，这些查询可以单独使用；第二种是cqc(Compound query clauses，复合查询语句)，用于组合多个查询。ES DSL的基本的API如下：

下面基于以上的method进行实战练习，全部操作在Kibana中完成。

1、启动es、es-head插件、Kibana

接下来，就可以愉快的写代码了。

为了方便大家操作，这里，我将所有代码写在一起，并且写了注释，帮助大家理解，所有的代码，均在Kibana中测试通过。这些代码中也包括上期的分词器(Elasticsearch学习分享（五）_CSAIWQYB的博客-CSDN博客)的使用。话不多说，直接上货：

# Part 1: index operations
# Create four empty indices
PUT /ai
PUT /nlp
PUT /ml
PUT /cv


# Fetch several indices in one request (comma-separated list).
# Only the indices created above are listed: naming an index that does not
# exist makes the whole request fail with index_not_found_exception.
GET /ai,nlp,ml,cv


# Check whether an index exists (HTTP 200 = exists, 404 = missing)
HEAD /nlp

# Presumably the "missing" case: nba is only loaded in Part 4, so this
# returns 404 unless that data has already been imported.
HEAD /nba


# Close an index
POST /nlp/_close

# Delete the indices
DELETE /ai
DELETE /nlp
DELETE /ml
DELETE /cv

# Part 2: mapping operations
# Part 1 ended by deleting every index, and a mapping can only be added to an
# existing index, so recreate the two indices used below first.
# (If they still exist these two requests fail with
# resource_already_exists_exception, which can be ignored.)
PUT /ai
PUT /nlp

# Add a mapping to the ai index
PUT /ai/_mapping
{
  "properties": {
    "domain": {
      "type": "text"
    },
    "representative": {
      "type": "text"
    },
    "methods": {
      "type": "keyword"
    }
  }
}

# Inspect the mappings: one index, several indices at once
GET /ai/_mapping
GET /ai,nlp/_mapping
# NOTE: this one needs the nba index, which is only loaded in Part 4
GET /nba/_mapping

# Change the mapping by adding fields (models, contribution)
POST /ai/_mapping
{
  "properties": {
    "domain": {
      "type": "text"
    },
    "representative": {
      "type": "text"
    },
    "methods": {
      "type": "keyword"
    },
    "models": {
      "type": "text"
    },
    "contribution": {
      "type": "keyword"
    }
  }
}

# Changing the type of an existing field is not allowed: this request is
# expected to be rejected because models is already mapped as text.
POST /ai/_mapping
{
  "properties": {
    "domain": {
      "type": "text"
    },
    "representative": {
      "type": "text"
    },
    "methods": {
      "type": "keyword"
    },
    "models": {
      "type": "keyword"
    },
    "contribution": {
      "type": "keyword"
    }
  }
}


# Part 3: document operations
# Create documents
# Document 1 has not been indexed yet, so this first GET reports "found": false
GET /ai/_doc/1
PUT /ai/_doc/1
{
  "domain": "NLP",
  "representative": "Yoshua Bengio",
  "methods": "DL",
  "models": "NNLM",
  "contribution": "Machine Translation"
}

# The field name must be spelled "domain" as in the mapping; a misspelled key
# would be indexed as a separate field and the mapped one left empty.
PUT /ai/_doc/2
{
  "domain": "DL",
  "representative": "Geoffrey Hinton",
  "methods": "NN",
  "models": "RBM",
  "contribution": "Text Representation"
}

PUT /ai/_doc/3
{
  "domain": "CV",
  "representative": "Yann LeCun",
  "methods": "DL",
  "models": "CNN",
  "contribution": "Image Classification"
}
# Fetch one document by id
GET /ai/_doc/1

# Fetch several documents in one request (index not given in the URL, so
# every item names its own _index).
# "_type" is omitted: the rest of this script uses the typeless APIs, and
# specifying types in multi-get requests is deprecated.
GET /_mget
{
  "docs": [
    {
      "_index": "ai",
      "_id": "1"
    },
    {
      "_index": "ai",
      "_id": "2"
    },
    {
      "_index": "ai",
      "_id": "3"
    }
  ]
}

# Fetch several documents (index given in the URL, so items only need _id)
GET /ai/_mget
{
  "docs": [
    {
      "_id": "1"
    },
    {
      "_id": "2"
    },
    {
      "_id": "3"
    }
  ]
}

# Update a document
# Look at the document before changing it
GET /ai/_doc/1

# Partial update: the fields under "doc" are merged into the stored document
POST /ai/_update/1
{
  "doc": {
    "domain": "NLP",
    "representative": "Yoshua Bengio",
    "methods": "GNN",
    "models": "NKLM & QLSTM",
    "contribution": "Machine Translation"
  }
}

# Scripted update: add a field (honour = 1)
POST /ai/_update/1
{
  "script": {
    "source": "ctx._source.honour=1"
  }
}

# Scripted update: remove the field again
POST /ai/_update/1
{
  "script": {
    "source": "ctx._source.remove(\"honour\")"
  }
}


                   # 四 搜索和查询(换个话题)

# Load the data

# Put the NBA player bulk file "player" into the Elasticsearch directory, then
# run this from a shell in that directory (it is ONE command line):
#   curl -X POST "localhost:9200/_bulk" -H 'Content-Type: application/json' --data-binary @player
# "@player" is a relative path, so it only works when the file is in the
# current directory; give the full path if the file lives somewhere else.

# Check that the data was loaded
GET /nba/_search

# Return every document, i.e. SELECT * FROM nba
GET /nba/_search
{
  "query": {
    "match_all": {}
  }
}

# Dropping the index would be done with the request below. It is left
# commented out on purpose: the small demo index created next has its own
# name (nba_small), while the advanced-query, sorting and aggregation
# examples further down still query the full nba data. If you do run it,
# reload the data with the curl command above before continuing.
# DELETE /nba

# A small hand-made index is used for the next few examples

# Create the nba_small index with an explicit mapping
PUT /nba_small
{
  "mappings": {
    "properties": {
      "age": {
        "type": "integer"
      },
      "name": {
        "type": "text"
      },
      "team_name": {
        "type": "text"
      },
      "position": {
        "type": "text"
      },
      "play_year": {
        "type": "integer"
      },
      "jerse_no": {
        "type": "keyword"
      }
    }
  }
}

# Index four documents into the new index.
# The years-played key is "play_year", exactly as declared in the mapping
# above (and as used by the later _update requests); any other spelling
# would be indexed as a separate, dynamically mapped field.
PUT /nba_small/_doc/1
{
  "age": 32,
  "name": "哈登",
  "team_name": "火箭",
  "position": "得分后卫",
  "play_year": 10,
  "jerse_no": "13"
}

PUT /nba_small/_doc/2
{
  "age": 28,
  "name": "库里",
  "team_name": "勇士",
  "position": "控球后卫",
  "play_year": 10,
  "jerse_no": "30"
}

PUT /nba_small/_doc/3
{
  "age": 25,
  "name": "詹姆斯",
  "team_name": "湖人",
  "position": "小前锋",
  "play_year": 15,
  "jerse_no": "23"
}

PUT /nba_small/_doc/4
{
  "age": 28,
  "name": "姚明",
  "team_name": "火箭",
  "position": "小前锋",
  "play_year": 15,
  "jerse_no": "23"
}

# terms query: matches documents whose age equals ANY of the listed values.
# These examples run against nba_small, the index that holds the four demo
# documents (fields age / name / team_name / ...). The values are ages that
# actually occur in those documents, so the query returns hits.
GET /nba_small/_search
{
  "query": {
    "terms": {
      "age": [
        25,
        28
      ]
    }
  }
}

# Full-text queries
# Everything in the index ---- match_all
GET /nba_small/_search
{
  "query": {
    "match_all": {}
  }
}

# Analyzed match on one field ---- match
# The query text is analyzed too, and by default a document matches when it
# contains any of the resulting terms.
GET /nba_small/_search
{
  "query": {
    "match": {
      "name": "知识库"
    }
  }
}

# Multi-field match ---- multi_match
# For the experiment, give document 2 a "title" field.
# The document lives in nba_small, so that is the index to update. The
# Chinese strings use the regular CJK characters (里, 士); visually identical
# Kangxi-radical code points would not match normal search input.
POST /nba_small/_update/2
{
  "doc": {
    "name": "库里",
    "team_name": "勇士",
    "position": "控球后卫",
    "play_year": 10,
    "jerse_no": "30",
    "title": "the best shooter"
  }
}
# Run the same text against several fields at once
GET /nba_small/_search
{
  "query": {
    "multi_match": {
      "query": "shooter",
      "fields": [
        "title",
        "name"
      ]
    }
  }
}

# Phrase match ---- match_phrase
GET /nba_small/_search
{
  "query": {
    "match_phrase": {
      "title": "best"
    }
  }
}

# The words of a phrase must appear next to each other in this order; no demo
# title contains "best one", so this is expected to return no hits.
GET /nba_small/_search
{
  "query": {
    "match_phrase": {
      "title": "best one"
    }
  }
}

# Phrase match with a prefix on the last word ---- match_phrase_prefix

# For convenience, give document 3 a "title" as well.
# The document lives in nba_small; the Chinese strings use the regular CJK
# characters (人, 小) rather than look-alike Kangxi-radical code points.
POST /nba_small/_update/3
{
  "doc": {
    "name": "詹姆斯",
    "team_name": "湖人",
    "position": "小前锋",
    "play_year": 15,
    "jerse_no": "23",
    "title": "the best small forward"
  }
}

# "the best" must match as a phrase, and "sm" is treated as a prefix of the next word
GET /nba_small/_search
{
  "query": {
    "match_phrase_prefix": {
      "title": "the best sm"
    }
  }
}

# Advanced queries
# The examples from here on use field names such as jerseyNo, displayName and
# teamNameEn, i.e. they target the bulk-loaded nba index, not nba_small.

# term queries revisited
# term queries are normally used on structured data (number, date, keyword)
# rather than on text fields

# term query
GET /nba/_search
{
  "query": {
    "term": {
      "jerseyNo": {
        "value": "23"
      }
    }
  }
}

# Paging: a search returns only 10 hits by default; use the from / size
# parameters to get more.
GET /nba/_search
{
  "from": 0,
  "size": 600,
  "query": {
    "match_phrase": {
      "displayName": "詹姆斯"
    }
  }
}
# _source: restrict the response to the listed fields

# exists query: documents that have a non-null value in the given field,
# here players whose team name is set
GET /nba/_search
{
  "from": 0,
  "size": 800,
  "_source": [
    "displayName",
    "teamNameEn"
  ],
  "query": {
    "exists": {
      "field": "teamNameEn"
    }
  }
}
# Note:
# without from/size only 10 hits are shown in Kibana;
# with them you get as many hits as requested

# prefix query: documents containing a term that starts with the given prefix
GET /nba/_search
{
  "query": {
    "prefix": {
      "teamNameEn": {
        "value": "Rock"
      }
    }
  }
}

# wildcard query: * matches any sequence of characters, ? matches exactly one
GET /nba/_search
{
  "query": {
    "wildcard": {
      "teamNameEn": {
        "value": "Ro*s"
      }
    }
  }
}

# regexp query: match terms against a regular expression
GET /nba/_search
{
  "query": {
    "regexp": {
      "teamNameEn": {
        "value": "Ro.*s"
      }
    }
  }
}

# ids query: fetch documents by their _id
GET /nba/_search
{
  "query": {
    "ids": {
      "values": [1, 2]
    }
  }
}

# Range queries
# A range query finds documents whose field value lies in a given range
# (dates, numbers, strings)

# Players who have played in the NBA for 2 to 10 years (both bounds inclusive)
GET /nba/_search
{
  "query": {
    "range": {
      "playYear": {
        "gte": 2,
        "lte": 10
      }
    }
  }
}

# Players born from 1980 through 1999.
# The upper bound is spelled out as the last day of 1999: when a bound leaves
# out month and day (e.g. "1999"), Elasticsearch fills them in as 01-01, which
# would exclude almost everybody born during 1999.
# NOTE(review): assumes birthDayStr is mapped as a date field -- confirm with
# GET /nba/_mapping.
GET /nba/_search
{
  "query": {
    "range": {
      "birthDayStr": {
        "gte": "1980-01-01",
        "lte": "1999-12-31",
        "format": "yyyy-MM-dd||yyyy"
      }
    }
  }
}

# Boolean queries
# A bool query combines four kinds of clauses: must, filter, must_not, should
# must:     the clause has to match and contributes to the score
# filter:   the clause has to match but is not scored
# must_not: the clause must not match (the opposite of must)
# should:   the clause ought to match

# must: players whose name contains "james".
# The English name is stored in displayNameEn (displayName holds the Chinese
# name, see the "詹姆斯" example earlier), so that is the field to search; it
# is also the field the filter example below uses for the same search.
GET /nba/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "displayNameEn": "james"
          }
        }
      ]
    }
  }
}

# filter: matches the same documents as must, but without scoring
GET /nba/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "match": {
            "displayNameEn": "james"
          }
        }
      ]
    }
  }
}

# must_not: players named james, excluding those in the Eastern conference
GET /nba/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "displayNameEn": "james" } }
      ],
      "must_not": [
        { "term": { "teamConferenceEn": "Eastern" } }
      ]
    }
  }
}

# should: players named james outside the Eastern conference who should have
# played for 11 to 20 years.
# Documents that do not satisfy the should clause are still returned; only
# their score differs.
GET /nba/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "displayNameEn": "james" } }
      ],
      "must_not": [
        { "term": { "teamConferenceEn": "Eastern" } }
      ],
      "should": [
        { "range": { "playYear": { "gte": 11, "lte": 20 } } }
      ]
    }
  }
}

# With minimum_should_match = 1 at least one should clause has to match
GET /nba/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "displayNameEn": "james" } }
      ],
      "must_not": [
        { "term": { "teamConferenceEn": "Eastern" } }
      ],
      "should": [
        { "range": { "playYear": { "gte": 11, "lte": 20 } } }
      ],
      "minimum_should_match": 1
    }
  }
}

# Sorting

# Rockets players ordered by years played, longest first
GET /nba/_search
{
  "query": {
    "match": {
      "teamNameEn": "Rockets"
    }
  },
  "sort": [
    {
      "playYear": {
        "order": "desc"
      }
    }
  ]
}

# Rockets players ordered by years played, longest first; players with the
# same number of years are ordered by height, tallest first.
# Both keys are descending, and the tie-break is a second entry in "sort".
# NOTE(review): heightValue is assumed to be the height field of the player
# data set -- confirm with GET /nba/_mapping. unmapped_type keeps the request
# from failing if the field is not mapped.
GET /nba/_search
{
  "query": {
    "match": {
      "teamNameEn": "Rockets"
    }
  },
  "sort": [
    {
      "playYear": {
        "order": "desc"
      }
    },
    {
      "heightValue": {
        "order": "desc",
        "unmapped_type": "float"
      }
    }
  ]
}

# Aggregations
# There are metric aggregations and bucket aggregations
# Metric aggregation: computes a figure such as max, min, sum or average over a set of documents
# Bucket aggregation: groups the documents first and aggregates per group


# 1. Metric aggregations: max, min, sum, avg
# ("size": 0 suppresses the hits so that only the aggregation result is returned)

# Average age of the Rockets players
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "avgAge": {
      "avg": { "field": "age" }
    }
  }
}

# Number of Rockets players that have a value for years played
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "countPlayerYear": {
      "value_count": { "field": "playYear" }
    }
  }
}

# How many players the Rockets have
GET /nba/_count
{
  "query": {
    "match": { "teamNameEn": "Rockets" }
  }
}

# cardinality counts distinct values
# e.g. the number of different ages among the Rockets players
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "countAgeDifferent": {
      "cardinality": { "field": "age" }
    }
  }
}
# Result in the author's run: 21 players in total, 13 different ages

# stats returns five values at once: count, max, min, avg, sum
# Age stats of the Rockets players
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "statsAge": {
      "stats": { "field": "age" }
    }
  }
}
# Reading the result: count is the number of players (21 in the author's run), min is the lowest age (21), ...

# extended_stats returns more figures than stats, e.g. sum of squares,
# variance, standard deviation, and the interval mean plus/minus two standard deviations
# Extended age stats of the Rockets players
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "extendstatsAge": {
      "extended_stats": { "field": "age" }
    }
  }
}

# percentiles returns the value found at each requested percentile; the
# default set is [1, 5, 25, 50, 75, 95, 99]
# Age percentiles of the Rockets players (the default set, written out explicitly)
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "percentileAge": {
      "percentiles": {
        "field": "age",
        "percents": [1, 5, 25, 50, 75, 95, 99]
      }
    }
  }
}

# Age percentiles of the Rockets players (custom percentile list)
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "percentileAge": {
      "percentiles": {
        "field": "age",
        "percents": [25, 50, 75, 100]
      }
    }
  }
}
# Reading the author's result: 25% of the players are aged <= 22.75, 50% are <= 25, 75% are <= 30.25, ...


# 2. Bucket aggregations (comparable to GROUP BY in MySQL)

# terms aggregation: one bucket per distinct value of a field
# Rockets players grouped by age
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "aggAge": {
      "terms": {
        "field": "age",
        "size": 10
      }
    }
  }
}
# Explanation: the inner size is the number of buckets to return (10 here).
# The documents are grouped first and the buckets are output in descending
# order of document count; if there are no more groups than size, all of them
# are returned, otherwise only the first size buckets.

# order: sorting the buckets
# Rockets players grouped by age, buckets sorted by age from high to low (by key)
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "aggAge": {
      "terms": {
        "field": "age",
        "size": 10,
        "order": { "_key": "desc" }
      }
    }
  }
}
# Explanation: _key is the bucket key, i.e. the age

# Rockets players grouped by age, buckets sorted by document count from high to low (by count)
GET /nba/_search
{
  "size": 0,
  "query": {
    "match": { "teamNameEn": "Rockets" }
  },
  "aggs": {
    "aggsAge": {
      "terms": {
        "field": "age",
        "size": 10,
        "order": { "_count": "desc" }
      }
    }
  }
}
# Explanation: _count sorts by the number of documents in each bucket

# range aggregation
# All NBA players grouped by age: under 20 (A), 20 up to 35 (B), 35 and over (C)
GET /nba/_search
{
  "size": 0,
  "aggs": {
    "ageRange": {
      "range": {
        "field": "age",
        "ranges": [
          { "key": "A", "to": 20 },
          { "key": "B", "from": 20, "to": 35 },
          { "key": "C", "from": 35 }
        ]
      }
    }
  }
}


# Analyzers (tokenization)
# The _analyze API shows the tokens an analyzer produces for a piece of text

# standard analyzer
GET _analyze
{
  "analyzer": "standard",
  "text": "Curry is the best 3-points shooter"
}


# whitespace analyzer
GET _analyze
{
  "analyzer": "whitespace",
  "text": "Curry is the best 3-points shooter"
}

# stop analyzer
GET _analyze
{
  "analyzer": "stop",
  "text": "Curry is the best 3-points shooter"
}

# language analyzer (english)
# The sample sentence is spelled correctly ("coming") so that the output shows
# what the analyzer does to a real English word.
GET _analyze
{
  "analyzer": "english",
  "text": "Curry is coming"
}

# Chinese tokenization
# Weakness of the default analyzer on Chinese text
GET _analyze
{
  "analyzer": "standard",
  "text": "姚明进入nba名人堂"
}

# ik analyzer, ik_smart mode
# (ik_smart and ik_max_word are not built-in analyzers; presumably they come
# from the IK analysis plugin covered in the previous article, which must be
# installed for these two requests to work)
GET _analyze
{
  "analyzer": "ik_smart",
  "text": "姚明进入nba名人堂"
}

# ik analyzer, ik_max_word mode
GET _analyze
{
  "analyzer": "ik_max_word",
  "text": "姚明进入nba名人堂"
}

掌握这些内容，那么ES DSL语言也算入门了，后续要想进阶学习，也会变得更加容易，之后如果有空，我会继续分享我的一些学习心得，学习与分享是为了让大家少走弯路，也是我写博客的动力所在，希望大家多多支持。

CSAIWQYB

关注

1
点赞
踩
6

收藏

觉得还不错? 一键收藏
打赏
0
评论
Elasticsearch学习分享（六）

本期学习ES DSL及其基本用法。什么是ES DSL呢？全称Elasticsearch Query DSL。DSL又是什么呢？DSL(Domain Specific Language)，英译中的结果就是，领域特定语言。DSL指的是专注于某个应用程序领域的计算机语言，又译作领域专用语言。不同于其他计算机语言，顾名思义，这种语言只用在某些特定的领域。ES DSL是专门属于ES的查询语言，elasticsearch提供标准Restful风格的查询DSL来定义查询。可以将查询DS...
复制链接

扫一扫