ELK高级搜索（二）

South.return

已于 2023-09-15 14:35:42 修改

阅读量651

点赞数

分类专栏：中间件文章标签： elk

于 2023-08-26 15:51:48 首次发布

本文链接：https://blog.csdn.net/fhsbvs/article/details/132512345

版权

中间件专栏收录该内容

6 篇文章 0 订阅

订阅专栏

7．Java api 文档管理

7.1 es技术特点

1 es技术比较特殊，不像其他分布式、大数据，es代码层面很好写，难的是概念的理解。

2 es最重要的是他的rest api。跨语言的。在真实生产中，探查数据、分析数据，使用rest更方便。

7.2 获取数据

java api 文档 https://www.elastic.co/guide/en/elasticsearch/client/java-rest/7.3/java-rest-overview.html

low : 偏向底层。
high：高级封装。

1 导包

    <dependency>
        <groupId>org.elasticsearch.client</groupId>
        <artifactId>elasticsearch-rest-high-level-client</artifactId>
        <version>7.3.0</version>
        <exclusions>
            <exclusion>
                <groupId>org.elasticsearch</groupId>
                <artifactId>elasticsearch</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch</artifactId>
        <version>7.3.0</version>
    </dependency>

2 代码

    // 1获取连接客户端
    RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(
            new HttpHost("localhost", 9200, "http")
    ));
    // 2构建请求
    GetRequest getRequest = new GetRequest("book", "1");
    // 3执行
    GetResponse getResponse = client.get(getRequest, RequestOptions.DEFAULT);
    // 4获取结果
    System.out.println(getResponse.getId());
    System.out.println(getResponse.getVersion());
    System.out.println(getResponse.getSourceAsString());

7.3 文档查询

1 导包

    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter</artifactId>
        <version>2.0.6.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
        <version>2.0.6.RELEASE</version>
    </dependency>

2 配置 application.yml

spring:
  application:
    name: service-search
heima:
  elasticsearch:
    hostlist: 127.0.0.1:9200 #多个结点中间用逗号分隔

3 代码

/**
 * 配置类
 */
@Configuration
public class ElasticsearchConfig {

    @Value("${heima.elasticsearch.hostlist}")
    private String hostlist;

    @Bean(destroyMethod = "close")
    public RestHighLevelClient restHighLevelClient() {
        String[] split = hostlist.split(",");
        HttpHost[] httpHostsArray = new HttpHost[split.length];
        for (int i = 0; i < split.length; i++) {
            String item = split[i];
            httpHostsArray[i] = new HttpHost(item.split(":")[0], Integer.parseInt(item.split(":")[1]), "http");
        }
        return new RestHighLevelClient(RestClient.builder(httpHostsArray));
    }
}

/**
 * 测试类
 */
@SpringBootTest(classes = SearchApplication.class)
@RunWith(SpringRunner.class)
public class TestDocument {

    @Autowired
    RestHighLevelClient client;

    /**
     * 文档查询
     */
    @Test
    public void testGet() throws IOException {
        // 查询数据
        // GET test_posts/_doc/1

        // 1构建请求
        GetRequest getRequest = new GetRequest("test_posts", "1");
        //===========可选参数==========
//        String[] includes = new String[]{"user", "message"}; 想要的字段
//        String[] excludes = Strings.EMPTY_ARRAY;             不想要的字段
//        FetchSourceContext fetchSourceContext = new FetchSourceContext(true, includes, excludes);
//        getRequest.fetchSourceContext(fetchSourceContext);
        String[] includes = Strings.EMPTY_ARRAY;
        String[] excludes = new String[]{"user", "message"};
        FetchSourceContext fetchSourceContext = new FetchSourceContext(true, includes, excludes);
        getRequest.fetchSourceContext(fetchSourceContext);

        // 2执行
        //同步查询
        GetResponse getResponse = client.get(getRequest, RequestOptions.DEFAULT);
        //异步查询
//        ActionListener<GetResponse> listener=new ActionListener<GetResponse>() {
//            //成功时
//            public void onResponse(GetResponse getResponse) {
//                System.out.println(getResponse.getId());
//                System.out.println(getResponse.getVersion());
//                System.out.println(getResponse.getSourceAsString());
//            }
//            //失败时
//            public void onFailure(Exception e) {
//                e.printStackTrace();
//            }
//        };
//        client.getAsync(getRequest, RequestOptions.DEFAULT, listener);
//        try {
//            Thread.sleep(5000);
//        } catch (InterruptedException e) {
//            e.printStackTrace();
//        }

        // 3获取结果
        if (getResponse.isExists()) {
            System.out.println(getResponse.getId());
            System.out.println(getResponse.getVersion());
            //以string获取数据
            System.out.println(getResponse.getSourceAsString());
            //以Bytes获取数据
            System.out.println(getResponse.getSourceAsBytes());
            //以Map获取数据
            System.out.println(getResponse.getSourceAsMap());
        }
    }
}

7.4 文档新增

/**
 * 测试类
 */
@SpringBootTest(classes = SearchApplication.class)
@RunWith(SpringRunner.class)
public class TestDocument {

    @Autowired
    RestHighLevelClient client;

    /**
     * 文档新增
     */
    @Test
    public void testAdd() throws IOException {
        // 新增数据
//        PUT test_posts/_doc/3
//        {
//            "user": "tomas",
//                "postDate": "2023-09-14",
//                "message": "trying out es1"
//        }

        // 1构建请求
        IndexRequest request = new IndexRequest("test_posts");
        request.id("3");
        //===========构建文档数据===========
        //方法1
        String jsonString = "{\n" +
                "  \"user\":\"tomas\",\n" +
                "  \"postDate\":\"2023-09-14\",\n" +
                "  \"message\":\"trying out es1\"\n" +
                "}";
        request.source(jsonString, XContentType.JSON);
        //方法2
//        Map<String, Object> jsonMap = new HashMap<String, Object>();
//        jsonMap.put("user", "tomas");
//        jsonMap.put("postDate", "2023-09-14");
//        jsonMap.put("message", "trying out es1");
//        request.source(jsonMap);
        //方法3
//        XContentBuilder builder = XContentFactory.jsonBuilder();
//        builder.startObject();
//        {
//            builder.field("user", "tomas");
//            builder.field("message", "trying out es1");
//            builder.timeField("postDate", "2023-09-14");
//        }
//        builder.endObject();
//        request.source(builder);
        //方法4
//        request.source("user", "tomas",
//                "message", "trying out es1",
//                "postDate", "2023-09-14");

        //===========可选参数===========
        //设置超时时间
        request.timeout("1s");
        request.timeout(TimeValue.timeValueSeconds(1));
        //手动维护版本号
        request.version(2);
        request.versionType(VersionType.EXTERNAL);

        // 2执行
        //同步新增
        IndexResponse indexResponse = client.index(request, RequestOptions.DEFAULT);
        //异步新增
//        ActionListener<IndexResponse> listener = new ActionListener<IndexResponse>() {
//            //成功时
//            public void onResponse(IndexResponse indexResponse) {
//                System.out.println(indexResponse.getIndex());
//                System.out.println(indexResponse.getId());
//                System.out.println(indexResponse.getResult());
//            }
//            //失败时
//            public void onFailure(Exception e) {
//
//            }
//        };
//        client.indexAsync(request, RequestOptions.DEFAULT, listener);
//        try {
//            Thread.sleep(5000);
//        } catch (InterruptedException e) {
//            e.printStackTrace();
//        }

        // 3获取结果
        System.out.println(indexResponse.getIndex());
        System.out.println(indexResponse.getId());

        if (indexResponse.getResult() == DocWriteResponse.Result.CREATED) {
            DocWriteResponse.Result result = indexResponse.getResult();
            System.out.println("CREATE" + result);
        } else if (indexResponse.getResult() == DocWriteResponse.Result.UPDATED) {
            DocWriteResponse.Result result = indexResponse.getResult();
            System.out.println("UPDATED" + result);
        }
//        "_shards" : {
//            "total" : 2,
//             "successful" : 1,
//             "failed" : 0
//        }
        ReplicationResponse.ShardInfo shardInfo = indexResponse.getShardInfo();
        if (shardInfo.getTotal() != shardInfo.getSuccessful()) {
            System.out.println("处理成功的分片数少于总分片！");
        }

        if (shardInfo.getFailed() > 0) {
            for (ReplicationResponse.ShardInfo.Failure failure : shardInfo.getFailures()) {
                //每一个错误的原因
                String reason = failure.reason();
                System.out.println(reason);
            }
        }
    }
}

7.5 文档修改

/**
 * 测试类
 */
@SpringBootTest(classes = SearchApplication.class)
@RunWith(SpringRunner.class)
public class TestDocument {

    @Autowired
    RestHighLevelClient client;

    /**
     * 文档更新
     */
    @Test
    public void testUpdate() throws IOException {
        // 修改数据
//        POST test_posts/_doc/3/_update
//        {
//            "doc": {
//                "user": "tomas Lee"
//            }
//        }

        // 1创建请求
        UpdateRequest request = new UpdateRequest("test_posts", "3");
        Map<String, Object> jsonMap = new HashMap<String, Object>();
        jsonMap.put("user", "tomas Lee");
        request.doc(jsonMap);
        //=========可选参数=========
        request.timeout("1s");
        //重试次数
        request.retryOnConflict(3);

        // 2执行
        //同步更新
        UpdateResponse updateResponse = client.update(request, RequestOptions.DEFAULT);
        //异步更新

        // 3获取结果
        System.out.println(updateResponse.getId());
        System.out.println(updateResponse.getIndex());
        //判断结果
        if (updateResponse.getResult() == DocWriteResponse.Result.CREATED) {
            DocWriteResponse.Result result = updateResponse.getResult();
            System.out.println("CREATED：" + result);
        } else if (updateResponse.getResult() == DocWriteResponse.Result.UPDATED) {
            DocWriteResponse.Result result = updateResponse.getResult();
            System.out.println("UPDATED：" + result);
        } else if (updateResponse.getResult() == DocWriteResponse.Result.DELETED) {
            DocWriteResponse.Result result = updateResponse.getResult();
            System.out.println("DELETED：" + result);
        } else if (updateResponse.getResult() == DocWriteResponse.Result.NOOP) {
            //没有操作
            DocWriteResponse.Result result = updateResponse.getResult();
            System.out.println("NOOP：" + result);
        }
    }
}

7.6 文档删除

/**
 * 测试类
 */
@SpringBootTest(classes = SearchApplication.class)
@RunWith(SpringRunner.class)
public class TestDocument {

    @Autowired
    RestHighLevelClient client;

    /**
     * 文档删除
     */
    @Test
    public void testDelete() throws IOException {
        // 删除数据
        // DELETE test_posts/_doc/3

        // 1创建请求
        DeleteRequest request = new DeleteRequest("test_posts", "3");
        //可选参数

        // 2执行
        DeleteResponse deleteResponse = client.delete(request, RequestOptions.DEFAULT);

        // 3获取结果
        System.out.println(deleteResponse.getId());
        System.out.println(deleteResponse.getIndex());
        DocWriteResponse.Result result = deleteResponse.getResult();
        System.out.println(result);
    }
}

7.7 文档bulk

/**
 * 测试类
 */
@SpringBootTest(classes = SearchApplication.class)
@RunWith(SpringRunner.class)
public class TestDocument {

    @Autowired
    RestHighLevelClient client;

    /**
     * 批量操作
     */
    @Test
    public void testBulk() throws IOException {
        // 1创建请求
        BulkRequest request = new BulkRequest();
        request.add(new IndexRequest("post").id("1").source(XContentType.JSON, "field", "1"));
        request.add(new IndexRequest("post").id("2").source(XContentType.JSON, "field", "2"));
//        request.add(new UpdateRequest("post", "1").doc(XContentType.JSON, "field", "3"));
//        request.add(new DeleteRequest("post").id("2"));

        // 2执行
        BulkResponse bulkResponse = client.bulk(request, RequestOptions.DEFAULT);

        // 3获取结果
        for (BulkItemResponse itemResponse : bulkResponse) {
            DocWriteResponse response = itemResponse.getResponse();
            switch (itemResponse.getOpType()) {
                case INDEX:
                    IndexResponse indexResponse = (IndexResponse) response;
                    System.out.println("INDEX:" + indexResponse.getResult());
                    break;
                case CREATE:
                    IndexResponse createResponse = (IndexResponse) response;
                    System.out.println("CREATE:" + createResponse.getResult());
                    break;
                case UPDATE:
                    UpdateResponse updateResponse = (UpdateResponse) response;
                    System.out.println("UPDATE:" + updateResponse.getResult());
                    break;
                case DELETE:
                    DeleteResponse deleteResponse = (DeleteResponse) response;
                    System.out.println("DELETE:" + deleteResponse.getResult());
                    break;
            }
        }
    }
}

8．图解es内部机制

8.1 es分布式基础

在这里插入图片描述

8.1.1 es对复杂分布式机制的透明隐藏特性

分布式机制：分布式数据存储及共享
分片机制：数据存储到哪个分片，副本数据写入
集群发现机制：cluster discovery。新启动es实例，自动加入集群
shard负载均衡：大量数据写入及查询，es会将数据平均分配
shard副本：新增副本数，分片重分配

8.1.2 Elasticsearch的垂直扩容与水平扩容

垂直扩容：使用更加强大的服务器替代老服务器。单机存储及运算能力有上线，且成本直线上升。如10t服务器1万，单个10T服务器可能20万。

水平扩容：采购更多服务器，加入集群。大数据。

8.1.3 增加或减少节点时的数据rebalance

新增或减少es实例时，es集群会将数据重新分配。

8.1.4 master节点

功能：

创建删除节点
创建删除索引

8.1.5 节点对等的分布式架构

节点对等，每个节点都能接收所有的请求
自动请求路由
响应收集

8.2 分片shard、副本replica

在这里插入图片描述

8.2.1 shard&replica机制

（1）每个index包含一个或多个shard

（2）每个shard都是一个最小工作单元，承载部分数据，lucene实例，完整的建立索引和处理请求的能力

（3）增减节点时，shard会自动在nodes中负载均衡

（4）primary shard和replica shard，每个document肯定只存在于某一个primary shard以及其对应的replica shard中，不可能存在于多个primary shard

（5）replica shard是primary shard的副本，负责容错，以及承担读请求负载

（6）primary shard的数量在创建索引的时候就固定了，replica shard的数量可以随时修改

（7）primary shard的默认数量是1，replica默认是1，默认共有2个shard，1个primary shard，1个replica shard

注意：es7以前primary shard的默认数量是5，replica默认是1，默认有10个shard，5个primary shard，5个replica shard

（8）primary shard不能和自己的replica shard放在同一个节点上（否则节点宕机，primary shard和副本都丢失，起不到容错的作用），但是可以和其他primary shard的replica shard放在同一个节点上

8.3 单node环境创建index

在这里插入图片描述

（1）单node环境下，创建一个index，有3个primary shard，3个replica shard
（2）集群status是yellow
（3）这个时候，只会将3个primary shard分配到仅有的一个node上去，另外3个replica shard是无法分配的
（4）集群可以正常工作，但是一旦出现节点宕机，数据全部丢失，而且集群不可用，无法承接任何请求

PUT /test_index1
{
   "settings" : {
      "number_of_shards" : 3,
      "number_of_replicas" : 1
   }
}

8.4 多node环境replica shard

在这里插入图片描述

（1）replica shard分配：3个primary shard，3个replica shard，1 node
（2）primary —> replica同步
（3）读请求：primary/replica

8.5 横向扩容

在这里插入图片描述

分片自动负载均衡，分片向空闲机器转移
每个节点存储更少分片，系统资源给与每个分片的资源更多，整体集群性能提高
扩容极限：节点数大于整体分片数，则必有空闲机器
超出扩容极限时，可以增加副本数，如设置副本数为2，总共3*3=9个分片。9台机器同时运行，存储和搜索性能更强，容错性更好
容错性：只要一个索引的所有主分片在，集群就就可以运行

8.6 es容错机制 master选举 replica容错数据恢复

在这里插入图片描述

以3分片，2副本数，3节点为例介绍。

master node宕机，自动master选举，集群为red
replica容错：新master将replica提升为primary shard，yellow
重启宕机node，master copy replica到该node，使用原有的shard并同步宕机后的修改，green

9．图解文档存储机制

9.1 数据路由

在这里插入图片描述

9.1.1 文档存储如何路由到相应分片

一个文档，最终会落在主分片的一个分片上，到底应该在哪一个分片？这就是数据路由。

9.1.2 路由算法

shard = hash(routing) % number_of_primary_shards

哈希值对主分片数取模。

举例：

对一个文档经行crud时，都会带一个路由值 routing number。默认为文档_id（可能是手动指定，也可能是自动生成）

存储1号文档，经过哈希计算，哈希值为2,此索引有3个主分片，那么计算2%3=2，就算出此文档在P2分片上。

决定一个document在哪个shard上，最重要的一个值就是routing值，默认是_id，也可以手动指定，相同的routing值，每次过来，从hash函数中，产出的hash值一定是相同的

无论hash值是几，无论是什么数字，对number_of_primary_shards求余数，结果一定是在0~number_of_primary_shards-1之间这个范围内的。0,1,2。

9.1.3 手动指定 routing number

PUT /test_index/_doc/15?routing=num
{
  "num": 0,
  "tags": []
}

场景：在程序中，架构师可以手动指定已有数据的一个属性为路由值，好处是可以定制一类文档数据存储到一个分片中。缺点是设计不好，会造成数据倾斜。

所以，不同文档尽量放到不同的索引中。剩下的事情交给es集群自己处理。

9.1.4 主分片数量不可变

涉及到以往数据的查询搜索，所以一旦建立索引，主分片数不可变。

9.2 文档增删改内部机制

在这里插入图片描述

增删改可以看做update，都是对数据的改动。一个改动请求发送到es集群，经历以下四个步骤：

（1）客户端选择一个node发送请求过去，这个node就是coordinating node（协调节点）

（2）coordinating node，对document进行路由，将请求转发给对应的node（有primary shard）

（3）实际的node上的primary shard处理请求，然后将数据同步到replica node

（4）coordinating node，如果发现primary node和所有replica node都搞定之后，就返回响应结果给客户端

9.3 文档查询内部机制

在这里插入图片描述

1、客户端发送请求到任意一个node，成为coordinate node

2、coordinate node对document进行路由，将请求转发到对应的node，此时会使用round-robin随机轮询算法，在primary shard以及其所有replica中随机选择一个，让读请求负载均衡

3、接收请求的node返回document给coordinate node

4、coordinate node返回document给客户端

5、特殊情况：document如果还在建立索引过程中，可能只有primary shard有，任何一个replica shard都没有，此时可能会导致无法读取到document，但是document完成索引建立之后，primary shard和replica shard就都有了

9.4 bulk api奇特的json格式

POST /_bulk
{"action": {"meta"}}\n
{"data"}\n
{"action": {"meta"}}\n
{"data"}\n

[
    {
        "action":{
            "method":"create"
        },
        "data":{
            "id":1,
            "field1":"java",
            "field1":"spring",
        }
    },
      {
        "action":{
            "method":"create"
        },
        "data":{
            "id":2,
            "field1":"java",
            "field1":"spring",
        }
    }       
]

1、bulk中的每个操作都可能要转发到不同的node的shard去执行

2、如果采用比较良好的json数组格式

允许任意的换行，整个可读性非常棒，读起来很爽，es拿到那种标准格式的json串以后，要按照下述流程去进行处理

（1）将json数组解析为JSONArray对象，这个时候，整个数据，就会在内存中出现一份一模一样的拷贝，一份数据是json文本，一份数据是JSONArray对象

（2）解析json数组里的每个json，对每个请求中的document进行路由

（3）为路由到同一个shard上的多个请求，创建一个请求数组。100请求中有10个是到P1

（4）将这个请求数组序列化

（5）将序列化后的请求数组发送到对应的节点上去

3、耗费更多内存，更多的jvm gc开销

我们之前提到过bulk size最佳大小的那个问题，一般建议说在几千条那样，然后大小在10MB左右，所以说，可怕的事情来了。假设说现在100个bulk请求发送到了一个节点上去，然后每个请求是10MB，100个请求，就是1000MB = 1GB，然后每个请求的json都copy一份为jsonarray对象，此时内存中的占用就会翻倍，就会占用2GB的内存，甚至还不止。因为弄成jsonarray之后，还可能会多搞一些其他的数据结构，2GB+的内存占用。

占用更多的内存可能就会积压其他请求的内存使用量，比如说最重要的搜索请求，分析请求，等等，此时就可能会导致其他请求的性能急速下降。

另外的话，占用内存更多，就会导致java虚拟机的垃圾回收次数更多，跟频繁，每次要回收的垃圾对象更多，耗费的时间更多，导致es的java虚拟机停止工作线程的时间更多。

4、现在的奇特格式

POST /_bulk
{ "delete": { "_index": "test_index",  "_id": "5" }} \n
{ "create": { "_index": "test_index",  "_id": "14" }}\n
{ "test_field": "test14" }\n
{ "update": { "_index": "test_index",  "_id": "2"} }\n
{ "doc" : {"test_field" : "bulk test"} }\n

（1）不用将其转换为json对象，不会出现内存中的相同数据的拷贝，直接按照换行符切割json

（2）对每两个一组的json，读取meta，进行document路由

（3）直接将对应的json发送到node上去

5、最大的优势在于，不需要将json数组解析为一个JSONArray对象，形成一份大数据的拷贝，浪费内存空间，尽可能地保证性能

10．Mapping映射入门

10.1 什么是mapping映射

概念：自动或手动为index中的_doc建立的一种数据结构和相关配置，简称为mapping映射。

插入几条数据，让es自动为我们建立一个索引

PUT /website/_doc/1
{
  "post_date": "2019-01-01",
  "title": "my first article",
  "content": "this is my first article in this website",
  "author_id": 11400
}

PUT /website/_doc/2
{
  "post_date": "2019-01-02",
  "title": "my second article",
  "content": "this is my second article in this website",
  "author_id": 11400
}
 
PUT /website/_doc/3
{
  "post_date": "2019-01-03",
  "title": "my third article",
  "content": "this is my third article in this website",
  "author_id": 11400
}

对比数据库建表语句

create table website(
     post_date date,
     title varchar(50),     
     content varchar(100),
     author_id int(11) 
 );

动态映射：dynamic mapping，自动为我们建立index，以及对应的mapping，mapping中包含了每个field对应的数据类型，以及如何分词等设置。

重点：也可以手动在创建数据之前，先创建index，以及对应的mapping

GET  /website/_mapping/
{
  "website" : {
    "mappings" : {
      "properties" : {
        "author_id" : {
          "type" : "long"
        },
        "content" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "post_date" : {
          "type" : "date"
        },
        "title" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        }
      }
    }
  }
}

尝试各种搜索

GET /website/_search?q=2019        0条结果             
GET /website/_search?q=2019-01-01           1条结果
GET /website/_search?q=post_date:2019-01-01     1条结果
GET /website/_search?q=post_date:2019          0 条结果

搜索结果为什么不一致，因为es自动建立mapping的时候，设置了不同的field不同的data type。不同的data type的分词、搜索等行为是不一样的。所以出现了_all field和post_date field的搜索表现完全不一样。

10.2 精确匹配与全文搜索

10.2.1 exact value 精确匹配

2019-01-01，exact value，搜索的时候，必须输入2019-01-01，才能搜索出来

如果你输入一个01，是搜索不出来的

select * from book where name= ‘java’

10.2.2 full text 全文检索

搜“笔记电脑”，笔记本电脑词条会不会出现。

select * from book where name like ‘%java%’

（1）缩写 vs. 全称：cn vs. china

（2）格式转化：like liked likes

（3）大小写：Tom vs tom

（4）同义词：like vs love

2019-01-01，2019 01 01，搜索2019，或者01，都可以搜索出来

china，搜索cn，也可以将china搜索出来

likes，搜索like，也可以将likes搜索出来

Tom，搜索tom，也可以将Tom搜索出来

like，搜索love，同义词，也可以将like搜索出来

就不是说单纯的只是匹配完整的一个值，而是可以对值进行拆分词语后（分词）进行匹配，也可以通过缩写、时态、大小写、同义词等进行匹配。深入 NPL，自然语义处理。

10.3 全文检索下倒排索引原理

doc1：I really liked my small dogs, and I think my mom also liked them.

doc2：He never liked any dogs, so I hope that my mom will not expect me to liked him.

分词，初步的倒排索引的建立

term	doc1	doc2
I	*	*
really	*
liked	*	*
my	*	*
small	*
dogs	*
and	*
think	*
mom	*	*
also	*
them	*
He		*
never		*
any		*
so		*
hope		*
that		*
will		*
not		*
expect		*
me		*
to		*
him		*

演示了一下倒排索引最简单的建立的一个过程

搜索

mother like little dog，不可能有任何结果

mother

little

dog

这不是我们想要的结果。同义词mom\mother在我们人类看来是一样。想进行标准化操作。

重建倒排索引

normalization正规化，建立倒排索引的时候，会执行一个操作，也就是说对拆分出的各个单词进行相应的处理，以提升后面搜索的时候能够搜索到相关联的文档的概率。时态的转换，单复数的转换，同义词的转换，大小写的转换

mom ―> mother

liked ―> like

small ―> little

dogs ―> dog

重新建立倒排索引，加入normalization，再次用mother liked little dog搜索，就可以搜索到了

term	doc1	doc2	normalization
I	*	*
really	*
like	*	*	liked ―> like
my	*	*
little	*		small ―> little
dog	*		dogs ―> dog
and	*
think	*
mother	*	*	mom ―> mother
also	*
them	*
He		*
never		*
any		*
so		*
hope		*
that		*
will		*
not		*
expect		*
me		*
to		*
him		*

重新搜索

搜索：mother liked little dog，

对搜索条件经行分词 normalization

mother

liked -> like

little

dog

doc1和doc2都会搜索出来

10.4 分词器 analyzer

10.4.1 什么是分词器 analyzer

作用：切分词语，normalization（提升recall召回率）

给你一段句子，然后将这段句子拆分成一个一个的单个的单词，同时对每个单词进行normalization（时态转换，单复数转换）

recall，召回率：搜索的时候，增加能够搜索到的结果的数量

analyzer 组成部分：

1、character filter：在一段文本进行分词之前，先进行预处理，比如说最常见的就是，过滤html标签（hello --> hello），& --> and（I&you --> I and you）

2、tokenizer：分词，hello you and me --> hello, you, and, me

3、token filter：lowercase，stop word，synonymom，dogs --> dog，liked --> like，Tom --> tom，a/the/an --> 干掉，mother --> mom，small --> little

stop word 停用词：了的呢。

一个分词器，很重要，将一段文本进行各种处理，最后处理好的结果才会拿去建立倒排索引。

10.4.2 内置分词器的介绍

例句：Set the shape to semi-transparent by calling set_trans(5)

standard analyzer标准分词器：set, the, shape, to, semi, transparent, by, calling, set_trans, 5（默认的是standard）

simple analyzer简单分词器：set, the, shape, to, semi, transparent, by, calling, set, trans

whitespace analyzer：Set, the, shape, to, semi-transparent, by, calling, set_trans(5)

language analyzer（特定的语言的分词器，比如说，english，英语分词器）：set, shape, semi, transpar, call, set_tran, 5

官方文档：

https://www.elastic.co/guide/en/elasticsearch/reference/7.4/analysis-analyzers.html

在这里插入图片描述

10.5 query string根据字段分词

10.5.1 query string分词

query string必须以和index建立时相同的analyzer进行分词

query string对exact value和full text的区别对待

如： date：exact value 精确匹配

text: full text 全文检索

10.5.2 测试分词器

GET /_analyze
{
  "analyzer": "standard",
  "text": "Text to analyze 80"
}

返回值：

{
  "tokens" : [
    {
      "token" : "text",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "to",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "analyze",
      "start_offset" : 8,
      "end_offset" : 15,
      "type" : "<ALPHANUM>",
      "position" : 2
    },
    {
      "token" : "80",
      "start_offset" : 16,
      "end_offset" : 18,
      "type" : "<NUM>",
      "position" : 3
    }
  ]
}

token 实际存储的term 关键字

position 在此词条在原文本中的位置

start_offset/end_offset字符在原始字符串中的位置

10.6 mapping回顾总结

（1）往es里面直接插入数据，es会自动建立索引，同时建立对应的mapping。(dynamic mapping)

（2）mapping中就自动定义了每个field的数据类型

（3）不同的数据类型（比如说text和date），可能有的是exact value，有的是full text

（4）exact value，在建立倒排索引的时候，分词的时候，是将整个值一起作为一个关键词建立到倒排索引中的；full text，会经历各种各样的处理，分词，normaliztion（时态转换，同义词转换，大小写转换），才会建立到倒排索引中

（5）同时呢，exact value和full text类型的field就决定了，在一个搜索过来的时候，对exact value field或者是full text field进行搜索的行为也是不一样的，会跟建立倒排索引的行为保持一致；比如说exact value搜索的时候，就是直接按照整个值进行匹配，full text query string，也会进行分词和normalization再去倒排索引中去搜索

（6）可以用es的dynamic mapping，让其自动建立mapping，包括自动设置数据类型；也可以提前手动创建index和tmapping，自己对各个field进行设置，包括数据类型，包括索引行为，包括分词器，等

10.7 mapping的核心数据类型以及dynamic mapping

10.7.1 核心的数据类型

string：text and keyword

byte，short，integer，long,float，double

boolean

date

详见：https://www.elastic.co/guide/en/elasticsearch/reference/7.3/mapping-types.html

下图是ES7.3核心的字段类型如下：

在这里插入图片描述

10.7.2 dynamic mapping 推测规则

true or false --> boolean

123 --> long

123.45 --> double

2019-01-01 --> date

“hello world” --> text/keywod

10.7.3 查看mapping

GET /index/_mapping/

10.8 手动管理mapping

10.8.1 查询所有索引的映射

GET /_mapping

10.8.2 创建映射 ！！

创建索引后，应该立即手动创建映射

PUT book/_mapping
{
	"properties": {
           "name": {
                  "type": "text"
            },
           "description": {
              "type": "text",
              "analyzer":"english",
              "search_analyzer":"english"
           },
           "pic":{
             "type":"text",
             "index":false
           },
           "studymodel":{
             "type":"text"
           }
    }
}

Text 文本类型

1）analyzer

通过analyzer属性指定分词器

上边指定了analyzer是指在索引和搜索都使用english，如果单独想定义搜索时使用的分词器则可以通过search_analyzer属性

2）index

index属性指定是否索引

默认为index=true，即要进行索引，只有进行索引才可以从索引库搜索到

但是也有一些内容不需要索引，比如：商品图片地址只被用来展示图片，不进行搜索图片，此时可以将index设置为false

删除索引，重新创建映射，将pic的index设置为false，尝试根据pic去搜索，结果搜索不到数据

3）store

是否在source之外存储，每个文档索引后会在 ES中保存一份原始文档，存放在"_source"中，一般情况下不需要设置store为true，因为在_source中已经有一份原始文档了

测试

PUT book/_mapping
{
		"properties": {
           "name": {
                  "type": "text"
            },
           "description": {
              "type": "text",
              "analyzer":"english",
              "search_analyzer":"english"
           },
           "pic":{
             "type":"text",
             "index":false
           },
           "studymodel":{
             "type":"text"
           }
    }
}

插入文档：

PUT /book/_doc/1
{
  "name":"Bootstrap开发框架",
  "description":"Bootstrap是由Twitter推出的一个前台页面开发框架，在行业之中使用较为广泛。此开发框架包含了大量的CSS、JS程序代码，可以帮助开发者（尤其是不擅长页面开发的程序人员）轻松的实现一个不受浏览器限制的精美界面效果。",
  "pic":"group1/M00/00/01/wKhlQFqO4MmAOP53AAAcwDwm6SU490.jpg",
  "studymodel":"201002"
}

GET book/_doc/3

GET /book/_search?q=name:开发

GET /book/_search?q=description:开发

GET /book/_search?q=pic:group1/M00/00/01/wKhlQFqO4MmAOP53AAAcwDwm6SU490.jpg

GET /book/_search?q=studymodel:201002

通过测试发现：name和description都支持全文检索，pic不可作为查询条件

keyword关键字字段

目前已经取代了"index": false。上边介绍的text文本字段在映射时要设置分词器，keyword字段为关键字字段，通常搜索keyword是按照整体搜索，所以创建keyword字段的索引时是不进行分词的，比如：邮政编码、手机号码、身份证等。keyword字段通常用于过虑、排序、聚合等。

date日期类型

日期类型不用设置分词器。

通常日期类型的字段用于排序。

format

通过format设置日期格式

例子：

下边的设置允许date字段存储年月日时分秒、年月日及毫秒三种格式。

{
   "properties": {
       "timestamp": {
         "type": "date",
         "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd"
       }
    }
}

插入文档：

Post book/_doc/3 
{
	"name": "spring开发基础",
	"description": "spring 在java领域非常流行，java程序员都在用。",
	"studymodel": "201001",
	"pic":"group1/M00/00/01/wKhlQFqO4MmAOP53AAAcwDwm6SU490.jpg",
	"timestamp":"2018-07-04 18:28:58"
}

数值类型

下边是ES支持的数值类型

在这里插入图片描述

1、尽量选择范围小的类型，提高搜索效率

2、对于浮点数尽量用比例因子，比如一个价格字段，单位为元，我们将比例因子设置为100这在ES中会按分存储，映射如下：

"price": {
    "type": "scaled_float",
    "scaling_factor": 100
},

由于比例因子为100，如果我们输入的价格是23.45则ES中会将23.45乘以100存储在ES中。

如果输入的价格是23.456，ES会将23.456乘以100再取一个接近原始值的数，得出2346。

使用比例因子的好处是整型比浮点型更易压缩，节省磁盘空间。

如果比例因子不适合，则从下表选择范围小的去用：

更新已有映射，并插入文档：

PUT book/_doc/3
{
	"name": "spring开发基础",
	"description": "spring 在java领域非常流行，java程序员都在用。",
	"studymodel": "201001",
	"pic":"group1/M00/00/01/wKhlQFqO4MmAOP53AAAcwDwm6SU490.jpg",
	"timestamp":"2018-07-04 18:28:58",
	"price":38.6
}

10.8.3 修改映射

只能创建index时手动建立mapping，或者新增field mapping，但是不能update field mapping。

因为已有数据按照映射早已分词存储好。如果修改，那这些存量数据怎么办。

新增一个字段mapping

PUT /book/_mapping/
{
  "properties" : {
    "new_field" : {
      "type": "text",
     "index": "false"
    }
  }
}

如果修改mapping，会报错

PUT /book/_mapping/
{
  "properties" : {
    "studymodel" : {
     "type": "keyword"
    }
  }
}

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "mapper [studymodel] of different type, current_type [text], merged_type [keyword]"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "mapper [studymodel] of different type, current_type [text], merged_type [keyword]"
  },
  "status": 400
}

10.8.4 删除映射

通过删除索引来删除映射

delete book

10.9 复杂数据类型

10.9.1 multivalue field

{ “tags”: [ “tag1”, “tag2” ]}

建立索引时与string是一样的，数据类型不能混

10.9.2 empty field

null，[]，[null]

10.9.3 object field

PUT /company/_doc/1
{
  "address": {
    "country": "china",
    "province": "guangdong",
    "city": "guangzhou"
  },
  "name": "jack",
  "age": 27,
  "join_date": "2019-01-01"
}

address：object类型

查询映射

GET /company/_mapping
{
  "company" : {
    "mappings" : {
      "properties" : {
        "address" : {
          "properties" : {
            "city" : {
              "type" : "text",
              "fields" : {
                "keyword" : {
                  "type" : "keyword",
                  "ignore_above" : 256
                }
              }
            },
            "country" : {
              "type" : "text",
              "fields" : {
                "keyword" : {
                  "type" : "keyword",
                  "ignore_above" : 256
                }
              }
            },
            "province" : {
              "type" : "text",
              "fields" : {
                "keyword" : {
                  "type" : "keyword",
                  "ignore_above" : 256
                }
              }
            }
          }
        },
        "age" : {
          "type" : "long"
        },
        "join_date" : {
          "type" : "date"
        },
        "name" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        }
      }
    }
  }
}

object

{
  "address": {
    "country": "china",
    "province": "guangdong",
    "city": "guangzhou"
  },
  "name": "jack",
  "age": 27,
  "join_date": "2017-01-01"
}

底层存储格式

{
    "name":         [jack],
    "age":          [27],
    "join_date":    [2017-01-01],
    "address.country":    [china],
    "address.province":   [guangdong],
    "address.city":       [guangzhou]
}

对象数组：

{
    "authors": [
        { "age": 26, "name": "Jack White"},
        { "age": 55, "name": "Tom Jones"},
        { "age": 39, "name": "Kitty Smith"}
    ]
}

存储格式：

{
    "authors.age":    [26, 55, 39],
    "authors.name":   [jack, white, tom, jones, kitty, smith]
}

South.return

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录

ELK高级搜索（二）

文章目录

7．Java api 文档管理

7.1 es技术特点

7.2 获取数据

7.3 文档查询

7.4 文档新增

7.5 文档修改

7.6 文档删除

7.7 文档bulk

8．图解es内部机制

8.1 es分布式基础

8.2 分片shard、副本replica

8.3 单node环境创建index

8.4 多node环境replica shard

8.5 横向扩容

8.6 es容错机制 master选举 replica容错 数据恢复

9．图解文档存储机制

9.1 数据路由

9.2 文档增删改内部机制

9.3 文档查询内部机制

9.4 bulk api奇特的json格式

10．Mapping映射入门

10.1 什么是mapping映射

10.2 精确匹配与全文搜索

10.3 全文检索下倒排索引原理

10.4 分词器 analyzer

10.5 query string根据字段分词

10.6 mapping回顾总结

10.7 mapping的核心数据类型以及dynamic mapping

10.8 手动管理mapping

10.9 复杂数据类型

8.6 es容错机制 master选举 replica容错数据恢复