The number being searched for may be incomplete (a user often remembers only a fragment, such as part of a licence plate), so the digits have to be split into substrings for partial matching; the ngram tokenizer is chosen for this.
Test
POST _analyze
{
  "tokenizer": "ngram",
  "text": "123456"
}
{
"tokens" : [
{
"token" : "1",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "12",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "2",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 2
},
{
"token" : "23",
"start_offset" : 1,
"end_offset" : 3,
"type" : "word",
"position" : 3
},
{
"token" : "3",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 4
},
{
"token" : "34",
"start_offset" : 2,
"end_offset" : 4,
"type" : "word",
"position" : 5
},
{
"token" : "4",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 6
},
{
"token" : "45",
"start_offset" : 3,
"end_offset" : 5,
"type" : "word",
"position" : 7
},
{
"token" : "5",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 8
},
{
"token" : "56",
"start_offset" : 4,
"end_offset" : 6,
"type" : "word",
"position" : 9
},
{
"token" : "6",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 10
}
]
}
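The built-in ngram tokenizer defaults to min_gram: 1 and max_gram: 2, which is why only single characters and two-character grams come back. The same output can be reproduced by spelling the defaults out, using _analyze's support for an inline tokenizer definition:
POST _analyze
{
  "tokenizer": {
    "type": "ngram",
    "min_gram": 1,
    "max_gram": 2
  },
  "text": "123456"
}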
Create the mapping
PUT test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 4,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "my_analyzer"
      }
    }
  }
}
Verify the analyzer against a sample plate number:
POST test/_analyze
{
  "analyzer": "my_analyzer",
  "text": "渝A253DC"
}
{
"tokens" : [
{
"token" : "渝A2",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 0
},
{
"token" : "渝A25",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 1
},
{
"token" : "A25",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 2
},
{
"token" : "A253",
"start_offset" : 1,
"end_offset" : 5,
"type" : "word",
"position" : 3
},
{
"token" : "253",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "253D",
"start_offset" : 2,
"end_offset" : 6,
"type" : "word",
"position" : 5
},
{
"token" : "53D",
"start_offset" : 3,
"end_offset" : 6,
"type" : "word",
"position" : 6
},
{
"token" : "53DC",
"start_offset" : 3,
"end_offset" : 7,
"type" : "word",
"position" : 7
},
{
"token" : "3DC",
"start_offset" : 4,
"end_offset" : 7,
"type" : "word",
"position" : 8
}
]
}
And against pure digits:
POST test/_analyze
{
  "analyzer": "my_analyzer",
  "text": "123456"
}
{
"tokens" : [
{
"token" : "123",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 0
},
{
"token" : "1234",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 1
},
{
"token" : "234",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 2
},
{
"token" : "2345",
"start_offset" : 1,
"end_offset" : 5,
"type" : "word",
"position" : 3
},
{
"token" : "345",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "3456",
"start_offset" : 2,
"end_offset" : 6,
"type" : "word",
"position" : 5
},
{
"token" : "456",
"start_offset" : 3,
"end_offset" : 6,
"type" : "word",
"position" : 6
}
]
}
The 3-to-4 gram range only matches fragments of exactly 3 or 4 characters. Widening the tokenizer to min_gram: 1, max_gram: 8 looks like the fix, but creating the index then fails with this error:
The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [1] but was [7]. This limit can be set by changing the [index.max_ngram_diff] index level setting.
Since ES 7.0 this limit is enforced, so a gram-length spread larger than 1 has to be allowed explicitly by raising the index.max_ngram_diff index setting.
Rebuild the mapping
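The test index created above still exists, and PUT test against an existing index fails with resource_already_exists_exception, so delete it first:
DELETE test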
PUT test
{
  "settings": {
    "index.max_ngram_diff": 8,
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "ngram",
          "min_gram": 1,
          "max_gram": 8,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "my_analyzer"
      }
    }
  }
}
Re-run the digit test; all grams from length 1 through 6 are now produced:
POST test/_analyze
{
  "analyzer": "my_analyzer",
  "text": "123456"
}
{
"tokens" : [
{
"token" : "1",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "12",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "123",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "1234",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "12345",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "123456",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 5
},
{
"token" : "2",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 6
},
{
"token" : "23",
"start_offset" : 1,
"end_offset" : 3,
"type" : "word",
"position" : 7
},
{
"token" : "234",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 8
},
{
"token" : "2345",
"start_offset" : 1,
"end_offset" : 5,
"type" : "word",
"position" : 9
},
{
"token" : "23456",
"start_offset" : 1,
"end_offset" : 6,
"type" : "word",
"position" : 10
},
{
"token" : "3",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 11
},
{
"token" : "34",
"start_offset" : 2,
"end_offset" : 4,
"type" : "word",
"position" : 12
},
{
"token" : "345",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 13
},
{
"token" : "3456",
"start_offset" : 2,
"end_offset" : 6,
"type" : "word",
"position" : 14
},
{
"token" : "4",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 15
},
{
"token" : "45",
"start_offset" : 3,
"end_offset" : 5,
"type" : "word",
"position" : 16
},
{
"token" : "456",
"start_offset" : 3,
"end_offset" : 6,
"type" : "word",
"position" : 17
},
{
"token" : "5",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 18
},
{
"token" : "56",
"start_offset" : 4,
"end_offset" : 6,
"type" : "word",
"position" : 19
},
{
"token" : "6",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 20
}
]
}
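With grams of length 1 through 8 indexed, any fragment in that length range can hit the document. A minimal end-to-end sketch (the document ID and the query fragment are arbitrary examples):
PUT test/_doc/1
{
  "name": "渝A253DC"
}

GET test/_search
{
  "query": {
    "match": {
      "name": "253D"
    }
  }
}
Note that by default a match query runs the query string through the same ngram analyzer, splitting it into many small grams, which can match very broadly; a common refinement is to declare a separate search_analyzer (for example keyword) on the field so only the user's literal fragment is looked up.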
Extension: edge_ngram
edge_ngram is a restricted form of ngram: every gram is anchored at the first character of the token, so only prefixes are produced. (The camelCase type name edgeNGram is deprecated since ES 7.x; the snippet below uses edge_ngram.) The effect is shown below.
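As before, drop the existing index before re-creating it:
DELETE test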
PUT test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "edge_ngram",
          "min_gram": 1,
          "max_gram": 8,
          "token_chars": [
            "letter",
            "digit"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "my_analyzer"
      }
    }
  }
}
Analyzing the same digits now yields only prefixes:
POST test/_analyze
{
  "analyzer": "my_analyzer",
  "text": "123456"
}
{
"tokens" : [
{
"token" : "1",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "12",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "123",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "1234",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "12345",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "123456",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 5
}
]
}
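Since every gram is a prefix, edge_ngram is the usual choice for search-as-you-type / autocomplete rather than match-anywhere lookups. The query string itself should not be split into grams again; one way (a sketch, assuming the sample document from earlier has been re-indexed into this index) is to force a plain analyzer on the query side:
GET test/_search
{
  "query": {
    "match": {
      "name": {
        "query": "渝A25",
        "analyzer": "keyword"
      }
    }
  }
}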