5 - Elasticsearch Analyzers

1. simple analyzer

The simple analyzer breaks text into tokens whenever it encounters a character that is not a letter, and lowercases every token.

# Analyze the given text with the simple analyzer
POST _analyze
{
  "analyzer": "simple",
  "text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}

Result:

{
  "tokens" : [
    {
      "token" : "a",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "good",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "book",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "may",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "be",
      "start_offset" : 16,
      "end_offset" : 18,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "among",
      "start_offset" : 19,
      "end_offset" : 24,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "the",
      "start_offset" : 25,
      "end_offset" : 28,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "best",
      "start_offset" : 29,
      "end_offset" : 33,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "of",
      "start_offset" : 34,
      "end_offset" : 36,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "friends",
      "start_offset" : 37,
      "end_offset" : 44,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "it",
      "start_offset" : 46,
      "end_offset" : 48,
      "type" : "word",
      "position" : 10
    },
    {
      "token" : "is",
      "start_offset" : 49,
      "end_offset" : 51,
      "type" : "word",
      "position" : 11
    },
    {
      "token" : "the",
      "start_offset" : 52,
      "end_offset" : 55,
      "type" : "word",
      "position" : 12
    },
    {
      "token" : "same",
      "start_offset" : 56,
      "end_offset" : 60,
      "type" : "word",
      "position" : 13
    },
    {
      "token" : "today",
      "start_offset" : 61,
      "end_offset" : 66,
      "type" : "word",
      "position" : 14
    },
    {
      "token" : "that",
      "start_offset" : 67,
      "end_offset" : 71,
      "type" : "word",
      "position" : 15
    },
    {
      "token" : "it",
      "start_offset" : 72,
      "end_offset" : 74,
      "type" : "word",
      "position" : 16
    },
    {
      "token" : "always",
      "start_offset" : 75,
      "end_offset" : 81,
      "type" : "word",
      "position" : 17
    },
    {
      "token" : "was",
      "start_offset" : 82,
      "end_offset" : 85,
      "type" : "word",
      "position" : 18
    },
    {
      "token" : "and",
      "start_offset" : 87,
      "end_offset" : 90,
      "type" : "word",
      "position" : 19
    },
    {
      "token" : "it",
      "start_offset" : 91,
      "end_offset" : 93,
      "type" : "word",
      "position" : 20
    },
    {
      "token" : "will",
      "start_offset" : 94,
      "end_offset" : 98,
      "type" : "word",
      "position" : 21
    },
    {
      "token" : "never",
      "start_offset" : 99,
      "end_offset" : 104,
      "type" : "word",
      "position" : 22
    },
    {
      "token" : "change",
      "start_offset" : 105,
      "end_offset" : 111,
      "type" : "word",
      "position" : 23
    }
  ]
}

Chinese text:

POST _analyze
{
  "analyzer": "simple",
  "text": "生活就是不断突破自我的过程。我们努力地向上,不仅是让世界看到我们,更是为了让自己看到世界。当我们一步一个脚印往前走时,就会发现,每一点进步,都在让我们的人生变得更辽阔。"
}

Result:

{
  "tokens" : [
    {
      "token" : "生活就是不断突破自我的过程",
      "start_offset" : 0,
      "end_offset" : 13,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "我们努力地向上",
      "start_offset" : 14,
      "end_offset" : 21,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "不仅是让世界看到我们",
      "start_offset" : 22,
      "end_offset" : 32,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "更是为了让自己看到世界",
      "start_offset" : 33,
      "end_offset" : 44,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "当我们一步一个脚印往前走时",
      "start_offset" : 45,
      "end_offset" : 58,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "就会发现",
      "start_offset" : 59,
      "end_offset" : 63,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "每一点进步",
      "start_offset" : 64,
      "end_offset" : 69,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "都在让我们的人生变得更辽阔",
      "start_offset" : 70,
      "end_offset" : 83,
      "type" : "word",
      "position" : 7
    }
  ]
}
2. simple_pattern tokenizer

The simple_pattern tokenizer emits the fragments of text that match a regular expression as tokens; text that does not match the pattern is discarded.

# Create an index whose custom tokenizer extracts tokens matching a regular expression
PUT idx_custom_simple_pattern
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_pattern_analyzer": {
          "tokenizer": "custom_pattern_tokenizer"
        }
      },
      "tokenizer": {
        "custom_pattern_tokenizer": {
          "type": "simple_pattern",
          "pattern":"[0123456789]{3}"
        }
      }
    }
  }
}

POST idx_custom_simple_pattern/_analyze
{
  "analyzer": "custom_pattern_analyzer",
  "text": "ads-132-754-235-23d-000"
}

Result:

{
  "tokens" : [
    {
      "token" : "132",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "754",
      "start_offset" : 8,
      "end_offset" : 11,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "235",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "000",
      "start_offset" : 20,
      "end_offset" : 23,
      "type" : "word",
      "position" : 3
    }
  ]
}
3. simple_pattern_split tokenizer

The simple_pattern_split tokenizer splits the input into tokens wherever the pattern matches; the matched text itself is discarded. Like simple_pattern, it supports only a restricted subset of regular-expression features, which keeps tokenization fast.

The default pattern is the empty string, which produces no splits at all, so a real pattern should always be configured.


# Create an index whose custom tokenizer splits on the given pattern
PUT idx_custom_simple_pattern_split
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_simple_pattern_split_analyzer": {
          "tokenizer": "custom_tokenizer"
        }
      },
      "tokenizer": {
        "custom_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": "-"
        }
      }
    }
  }
}

POST idx_custom_simple_pattern_split/_analyze
{
  "analyzer": "custom_simple_pattern_split_analyzer",
  "text": "ads-132-754-235-23d-000"
}

Result:

{
  "tokens" : [
    {
      "token" : "ads",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "132",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "754",
      "start_offset" : 8,
      "end_offset" : 11,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "235",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "23d",
      "start_offset" : 16,
      "end_offset" : 19,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "000",
      "start_offset" : 20,
      "end_offset" : 23,
      "type" : "word",
      "position" : 5
    }
  ]
}
4. standard analyzer

The standard analyzer is the default analyzer. It splits text on word boundaries as defined by the Unicode Text Segmentation algorithm and lowercases the resulting tokens.

# Analyze with the standard analyzer
POST _analyze
{
  "analyzer": "standard",
  "text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}

Result:

{
  "tokens" : [
    {
      "token" : "a",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "good",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "book",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "<ALPHANUM>",
      "position" : 2
    },
    ...
    {
      "token" : "change",
      "start_offset" : 105,
      "end_offset" : 111,
      "type" : "<ALPHANUM>",
      "position" : 23
    }
  ]
}
POST _analyze
{
  "analyzer": "standard",
  "text": "生活就是不断突破自我的过程。我们努力地向上,不仅是让世界看到我们,更是为了让自己看到世界。当我们一步一个脚印往前走时,就会发现,每一点进步,都在让我们的人生变得更辽阔。"
}

Result:

{
  "tokens" : [
    {
      "token" : "生",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<IDEOGRAPHIC>",
      "position" : 0
    },
    {
      "token" : "活",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "就",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    ...
    {
      "token" : "辽",
      "start_offset" : 81,
      "end_offset" : 82,
      "type" : "<IDEOGRAPHIC>",
      "position" : 74
    },
    {
      "token" : "阔",
      "start_offset" : 82,
      "end_offset" : 83,
      "type" : "<IDEOGRAPHIC>",
      "position" : 75
    }
  ]
}
# Configure the standard analyzer (use with care: max_token_length splits tokens longer than 5 characters, and _english_ removes English stop words)
PUT idx_standard
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_analyzer":{
          "type":"standard",
          "max_token_length":5,
          "stopwords":"_english_"
        }
      }
    }
  }
}

# Analyze with the custom english_analyzer
POST idx_standard/_analyze
{
  "analyzer": "english_analyzer",
  "text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}

Result:

{
  "tokens" : [
    {
      "token" : "good",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "book",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "<ALPHANUM>",
      "position" : 2
    },
    {
      "token" : "may",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "<ALPHANUM>",
      "position" : 3
    },
    {
      "token" : "among",
      "start_offset" : 19,
      "end_offset" : 24,
      "type" : "<ALPHANUM>",
      "position" : 5
    },
    {
      "token" : "best",
      "start_offset" : 29,
      "end_offset" : 33,
      "type" : "<ALPHANUM>",
      "position" : 7
    },
    {
      "token" : "frien",
      "start_offset" : 37,
      "end_offset" : 42,
      "type" : "<ALPHANUM>",
      "position" : 9
    },
    {
      "token" : "ds",
      "start_offset" : 42,
      "end_offset" : 44,
      "type" : "<ALPHANUM>",
      "position" : 10
    },
    {
      "token" : "same",
      "start_offset" : 56,
      "end_offset" : 60,
      "type" : "<ALPHANUM>",
      "position" : 14
    },
    {
      "token" : "today",
      "start_offset" : 61,
      "end_offset" : 66,
      "type" : "<ALPHANUM>",
      "position" : 15
    },
    {
      "token" : "alway",
      "start_offset" : 75,
      "end_offset" : 80,
      "type" : "<ALPHANUM>",
      "position" : 18
    },
    {
      "token" : "s",
      "start_offset" : 80,
      "end_offset" : 81,
      "type" : "<ALPHANUM>",
      "position" : 19
    },
    {
      "token" : "never",
      "start_offset" : 99,
      "end_offset" : 104,
      "type" : "<ALPHANUM>",
      "position" : 24
    },
    {
      "token" : "chang",
      "start_offset" : 105,
      "end_offset" : 110,
      "type" : "<ALPHANUM>",
      "position" : 25
    },
    {
      "token" : "e",
      "start_offset" : 110,
      "end_offset" : 111,
      "type" : "<ALPHANUM>",
      "position" : 26
    }
  ]
}
5. keyword vs. text

A text field is analyzed: its value is split into terms and the terms are written to the inverted index. It supports full-text search, but it is not suited to aggregations or sorting.

A keyword field is not analyzed: the whole field value is indexed as a single term. It does not support full-text search (only exact matching), but it does support aggregations and sorting. A short example follows.

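A minimal sketch of the difference (the index name idx_kw_text, the field names, and the sample document below are made up for illustration): the match query searches the analyzed text field, the term query matches the keyword field only against the exact stored value, and the aggregation runs on the keyword field.

# Hypothetical index with one text field and one keyword field
PUT idx_kw_text
{
  "mappings": {
    "properties": {
      "title":    { "type": "text" },
      "category": { "type": "keyword" }
    }
  }
}

POST idx_kw_text/_doc
{
  "title": "A good book may be among the best of friends",
  "category": "reading notes"
}

# Full-text search on the analyzed text field matches a single term
POST idx_kw_text/_search
{
  "query": { "match": { "title": "book" } }
}

# A term query on the keyword field matches only the exact value
POST idx_kw_text/_search
{
  "query": { "term": { "category": "reading notes" } }
}

# Aggregations (and sorting) work on the keyword field
POST idx_kw_text/_search
{
  "size": 0,
  "aggs": {
    "by_category": { "terms": { "field": "category" } }
  }
}
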
6. IK analyzer
Download

Search GitHub for elasticsearch-analysis-ik.

Download URL: https://github.com/medcl/elasticsearch-analysis-ik/releases

Installation

Create an ik subdirectory under the plugins directory of the Elasticsearch installation:

[esuser@db plugins]$ mkdir -p /home/elasticsearch/elasticsearch-7.13.2/plugins/ik
[esuser@db plugins]$ ll
total 0
drwxr-xr-x. 2 esuser esgroup 6 Mar  8 16:45 ik
[esuser@db plugins]$ 

Change into the ik directory:

[esuser@db ~]$ cd /home/elasticsearch/elasticsearch-7.13.2/plugins/ik
[esuser@db ik]$

Upload the elasticsearch-analysis-ik package and unzip it:

unzip elasticsearch-analysis-ik-7.13.2.zip

Restart Elasticsearch:

[esuser@db ~]$ cd /home/elasticsearch/elasticsearch-7.13.2
[esuser@db elasticsearch-7.13.2]$ ./bin/elasticsearch

Test

Without the IK analyzer (the default standard analyzer is used):

POST _analyze
{
  "text": "内心没有分别心,就是真正的苦行"
}

Result:

{
  "tokens" : [
    {
      "token" : "内",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<IDEOGRAPHIC>",
      "position" : 0
    },
    {
      "token" : "心",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "没",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    {
      "token" : "有",
      "start_offset" : 3,
      "end_offset" : 4,
      "type" : "<IDEOGRAPHIC>",
      "position" : 3
    },
    {
      "token" : "分",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "<IDEOGRAPHIC>",
      "position" : 4
    },
    {
      "token" : "别",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "<IDEOGRAPHIC>",
      "position" : 5
    },
    {
      "token" : "心",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "<IDEOGRAPHIC>",
      "position" : 6
    },
    {
      "token" : "就",
      "start_offset" : 8,
      "end_offset" : 9,
      "type" : "<IDEOGRAPHIC>",
      "position" : 7
    },
    {
      "token" : "是",
      "start_offset" : 9,
      "end_offset" : 10,
      "type" : "<IDEOGRAPHIC>",
      "position" : 8
    },
    {
      "token" : "真",
      "start_offset" : 10,
      "end_offset" : 11,
      "type" : "<IDEOGRAPHIC>",
      "position" : 9
    },
    {
      "token" : "正",
      "start_offset" : 11,
      "end_offset" : 12,
      "type" : "<IDEOGRAPHIC>",
      "position" : 10
    },
    {
      "token" : "的",
      "start_offset" : 12,
      "end_offset" : 13,
      "type" : "<IDEOGRAPHIC>",
      "position" : 11
    },
    {
      "token" : "苦",
      "start_offset" : 13,
      "end_offset" : 14,
      "type" : "<IDEOGRAPHIC>",
      "position" : 12
    },
    {
      "token" : "行",
      "start_offset" : 14,
      "end_offset" : 15,
      "type" : "<IDEOGRAPHIC>",
      "position" : 13
    }
  ]
}

With the IK analyzer:

POST _analyze
{
  "analyzer": "ik_max_word", 
  "text": "内心没有分别心,就是真正的苦行"
}

Result:

{
  "tokens" : [
    {
      "token" : "内心",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "没有",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "分别",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "心",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "就是",
      "start_offset" : 8,
      "end_offset" : 10,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "真正",
      "start_offset" : 10,
      "end_offset" : 12,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "的",
      "start_offset" : 12,
      "end_offset" : 13,
      "type" : "CN_CHAR",
      "position" : 6
    },
    {
      "token" : "苦行",
      "start_offset" : 13,
      "end_offset" : 15,
      "type" : "CN_WORD",
      "position" : 7
    }
  ]
}

Note:

If the IK plugin is not installed, specifying "analyzer": "ik_max_word" returns an error.

If the IK plugin is installed but no analyzer is specified, the default (standard) analyzer is used.

IK tokenization modes

1. ik_max_word: splits the text at the finest granularity, producing overlapping terms.

2. ik_smart: splits the text at the coarsest granularity.

ik_max_word
POST _analyze
{
  "analyzer": "ik_max_word",
  "text": "我是中国人"
}

Result:

{
  "tokens" : [
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "是",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 1
    },
    {
      "token" : "中国人",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "中国",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "国人",
      "start_offset" : 3,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 4
    }
  ]
}
ik_smart
POST _analyze
{
  "analyzer": "ik_smart",
  "text": "我是中国人"
}

Result:

{
  "tokens" : [
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "是",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 1
    },
    {
      "token" : "中国人",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 2
    }
  ]
}
Creating index mappings that use the IK analyzer

All text fields use the ik_max_word mode by default:


# Create an index mapping; the index-level default analyzer makes every text field use ik_max_word
PUT idx_user_ik_01
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ik": {
          "tokenizer": "ik_max_word"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "user_id": {
        "type": "integer"
      },
      "user_name": {
        "type": "text",
        "fields": {
          "username": {
            "type": "keyword"
          }
        }
      },
      "age": {
        "type": "integer"
      },
      "address": {
        "type": "text"
      }
    }
  }
}
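
To check that the index-level default analyzer is really applied to the text fields, you can analyze a sample value against one of the mapped fields (assuming the index above has been created and the IK plugin is installed):

# Analyze a value with the analyzer that the address field resolves to
POST idx_user_ik_01/_analyze
{
  "field": "address",
  "text": "我是中国人"
}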

Create an IK index mapping that specifies an analyzer for each text field. Here analyzer controls how the field value is tokenized at index time, and search_analyzer controls how query strings against that field are analyzed at search time.

# Create an index mapping that specifies an analyzer for each text field
PUT idx_user_ik_02
{
  "mappings": {
    "properties": {
      "user_id": {
        "type": "integer"
      },
      "user_name": {
        "type": "text",
        "analyzer": "ik_smart",
        "search_analyzer": "ik_smart"
      },
      "age": {
        "type": "integer"
      },
      "address": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "standard"
      }
    }
  }
}

PUT idx_user_info_ik
{
  "mappings": {
    "properties": {
      "name": {
        "type": "keyword"
      },
      "address": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      }
    }
  }
}

POST idx_user_info_ik/_doc
{
  "name":"曹操",
  "address":"魏国"
}

POST idx_user_info_ik/_doc
{
  "name":"诸葛亮",
  "address":"蜀国"
}

POST idx_user_info_ik/_doc
{
  "name":"周瑜",
  "address":"吴国"
}

POST idx_user_info_ik/_doc
{
  "name":"张飞",
  "address":"蜀国"
}

POST idx_user_info_ik/_search
{
  "query": {
    "match": {
      "address": {
        "query": "蜀国"
      }
    }
  }
}

Query result:

{
  "took" : 19,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 0.6931471,
    "hits" : [
      {
        "_index" : "idx_user_info_ik",
        "_type" : "_doc",
        "_id" : "dQDywIYBnnKcIM3DPvhR",
        "_score" : 0.6931471,
        "_source" : {
          "name" : "诸葛亮",
          "address" : "蜀国"
        }
      },
      {
        "_index" : "idx_user_info_ik",
        "_type" : "_doc",
        "_id" : "dwDywIYBnnKcIM3DUvhX",
        "_score" : 0.6931471,
        "_source" : {
          "name" : "张飞",
          "address" : "蜀国"
        }
      }
    ]
  }
}
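
Both documents whose address is 蜀国 are returned because the terms produced by ik_max_word at index time and by ik_smart on the query string share at least one term. You can compare the two modes on the indexed value directly (a quick check, assuming the IK plugin is installed):

POST _analyze
{
  "analyzer": "ik_max_word",
  "text": "蜀国"
}

POST _analyze
{
  "analyzer": "ik_smart",
  "text": "蜀国"
}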