5 - Elasticsearch Analyzers

1. simple analyzer

The simple analyzer breaks text into tokens whenever it encounters a character that is not a letter, and lowercases every token.

# Analyze the given text with the simple analyzer
POST _analyze
{
  "analyzer": "simple",
  "text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}

Result:

{
  "tokens" : [
    {
      "token" : "a",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "good",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "book",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "may",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "be",
      "start_offset" : 16,
      "end_offset" : 18,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "among",
      "start_offset" : 19,
      "end_offset" : 24,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "the",
      "start_offset" : 25,
      "end_offset" : 28,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "best",
      "start_offset" : 29,
      "end_offset" : 33,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "of",
      "start_offset" : 34,
      "end_offset" : 36,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "friends",
      "start_offset" : 37,
      "end_offset" : 44,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "it",
      "start_offset" : 46,
      "end_offset" : 48,
      "type" : "word",
      "position" : 10
    },
    {
      "token" : "is",
      "start_offset" : 49,
      "end_offset" : 51,
      "type" : "word",
      "position" : 11
    },
    {
      "token" : "the",
      "start_offset" : 52,
      "end_offset" : 55,
      "type" : "word",
      "position" : 12
    },
    {
      "token" : "same",
      "start_offset" : 56,
      "end_offset" : 60,
      "type" : "word",
      "position" : 13
    },
    {
      "token" : "today",
      "start_offset" : 61,
      "end_offset" : 66,
      "type" : "word",
      "position" : 14
    },
    {
      "token" : "that",
      "start_offset" : 67,
      "end_offset" : 71,
      "type" : "word",
      "position" : 15
    },
    {
      "token" : "it",
      "start_offset" : 72,
      "end_offset" : 74,
      "type" : "word",
      "position" : 16
    },
    {
      "token" : "always",
      "start_offset" : 75,
      "end_offset" : 81,
      "type" : "word",
      "position" : 17
    },
    {
      "token" : "was",
      "start_offset" : 82,
      "end_offset" : 85,
      "type" : "word",
      "position" : 18
    },
    {
      "token" : "and",
      "start_offset" : 87,
      "end_offset" : 90,
      "type" : "word",
      "position" : 19
    },
    {
      "token" : "it",
      "start_offset" : 91,
      "end_offset" : 93,
      "type" : "word",
      "position" : 20
    },
    {
      "token" : "will",
      "start_offset" : 94,
      "end_offset" : 98,
      "type" : "word",
      "position" : 21
    },
    {
      "token" : "never",
      "start_offset" : 99,
      "end_offset" : 104,
      "type" : "word",
      "position" : 22
    },
    {
      "token" : "change",
      "start_offset" : 105,
      "end_offset" : 111,
      "type" : "word",
      "position" : 23
    }
  ]
}

Chinese text:

POST _analyze
{
  "analyzer": "simple",
  "text": "生活就是不断突破自我的过程。我们努力地向上,不仅是让世界看到我们,更是为了让自己看到世界。当我们一步一个脚印往前走时,就会发现,每一点进步,都在让我们的人生变得更辽阔。"
}

Result:

{
  "tokens" : [
    {
      "token" : "生活就是不断突破自我的过程",
      "start_offset" : 0,
      "end_offset" : 13,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "我们努力地向上",
      "start_offset" : 14,
      "end_offset" : 21,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "不仅是让世界看到我们",
      "start_offset" : 22,
      "end_offset" : 32,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "更是为了让自己看到世界",
      "start_offset" : 33,
      "end_offset" : 44,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "当我们一步一个脚印往前走时",
      "start_offset" : 45,
      "end_offset" : 58,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "就会发现",
      "start_offset" : 59,
      "end_offset" : 63,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "每一点进步",
      "start_offset" : 64,
      "end_offset" : 69,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "都在让我们的人生变得更辽阔",
      "start_offset" : 70,
      "end_offset" : 83,
      "type" : "word",
      "position" : 7
    }
  ]
}
2. simple_pattern tokenizer

The simple_pattern tokenizer emits the fragments of text that match a regular expression as tokens; text that does not match the pattern is discarded.

# Create an index whose custom tokenizer extracts tokens matching a regular expression
PUT idx_custom_simple_pattern
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_pattern_analyzer": {
          "tokenizer": "custom_pattern_tokenizer"
        }
      },
      "tokenizer": {
        "custom_pattern_tokenizer": {
          "type": "simple_pattern",
          "pattern":"[0123456789]{3}"
        }
      }
    }
  }
}

POST idx_custom_simple_pattern/_analyze
{
  "analyzer": "custom_pattern_analyzer",
  "text": "ads-132-754-235-23d-000"
}

Result:

{
  "tokens" : [
    {
      "token" : "132",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "754",
      "start_offset" : 8,
      "end_offset" : 11,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "235",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "000",
      "start_offset" : 20,
      "end_offset" : 23,
      "type" : "word",
      "position" : 3
    }
  ]
}
3. simple_pattern_split tokenizer

The simple_pattern_split tokenizer splits the input into tokens wherever the pattern matches; the matched text itself is discarded. Like simple_pattern, it supports only a restricted subset of regular-expression features, which keeps tokenization fast.

The default pattern is the empty string, which produces no splits at all, so a real pattern should always be configured.


# Create an index whose custom tokenizer splits on the given pattern
PUT idx_custom_simple_pattern_split
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_simple_pattern_split_analyzer": {
          "tokenizer": "custom_tokenizer"
        }
      },
      "tokenizer": {
        "custom_tokenizer": {
          "type": "simple_pattern_split",
          "pattern": "-"
        }
      }
    }
  }
}

POST idx_custom_simple_pattern_split/_analyze
{
  "analyzer": "custom_simple_pattern_split_analyzer",
  "text": "ads-132-754-235-23d-000"
}

Result:

{
  "tokens" : [
    {
      "token" : "ads",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "132",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "754",
      "start_offset" : 8,
      "end_offset" : 11,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "235",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "23d",
      "start_offset" : 16,
      "end_offset" : 19,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "000",
      "start_offset" : 20,
      "end_offset" : 23,
      "type" : "word",
      "position" : 5
    }
  ]
}
4. standard analyzer

The standard analyzer is the default analyzer. It splits text on word boundaries as defined by the Unicode Text Segmentation algorithm and lowercases the resulting tokens.

# Analyze with the standard analyzer
POST _analyze
{
  "analyzer": "standard",
  "text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}

Result:

{
  "tokens" : [
    {
      "token" : "a",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "good",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "book",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "<ALPHANUM>",
      "position" : 2
    },
    ...
    {
      "token" : "change",
      "start_offset" : 105,
      "end_offset" : 111,
      "type" : "<ALPHANUM>",
      "position" : 23
    }
  ]
}
POST _analyze
{
  "analyzer": "standard",
  "text": "生活就是不断突破自我的过程。我们努力地向上,不仅是让世界看到我们,更是为了让自己看到世界。当我们一步一个脚印往前走时,就会发现,每一点进步,都在让我们的人生变得更辽阔。"
}

Result:

{
  "tokens" : [
    {
      "token" : "生",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<IDEOGRAPHIC>",
      "position" : 0
    },
    {
      "token" : "活",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "就",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    ...
    {
      "token" : "辽",
      "start_offset" : 81,
      "end_offset" : 82,
      "type" : "<IDEOGRAPHIC>",
      "position" : 74
    },
    {
      "token" : "阔",
      "start_offset" : 82,
      "end_offset" : 83,
      "type" : "<IDEOGRAPHIC>",
      "position" : 75
    }
  ]
}
# Configure the standard analyzer (use with care: max_token_length splits tokens longer than 5 characters, and _english_ removes English stop words)
PUT idx_standard
{
  "settings": {
    "analysis": {
      "analyzer": {
        "english_analyzer":{
          "type":"standard",
          "max_token_length":5,
          "stopwords":"_english_"
        }
      }
    }
  }
}

# Analyze with the custom english_analyzer
POST idx_standard/_analyze
{
  "analyzer": "english_analyzer",
  "text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}

Result:

{
  "tokens" : [
    {
      "token" : "good",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "book",
      "start_offset" : 7,
      "end_offset" : 11,
      "type" : "<ALPHANUM>",
      "position" : 2
    },
    {
      "token" : "may",
      "start_offset" : 12,
      "end_offset" : 15,
      "type" : "<ALPHANUM>",
      "position" : 3
    },
    {
      "token" : "among",
      "start_offset" : 19,
      "end_offset" : 24,
      "type" : "<ALPHANUM>",
      "position" : 5
    },
    {
      "token" : "best",
      "start_offset" : 29,
      "end_offset" : 33,
      "type" : "<ALPHANUM>",
      "position" : 7
    },
    {
      "token" : "frien",
      "start_offset" : 37,
      "end_offset" : 42,
      "type" : "<ALPHANUM>",
      "position" : 9
    },
    {
      "token" : "ds",
      "start_offset" : 42,
      "end_offset" : 44,
      "type" : "<ALPHANUM>",
      "position" : 10
    },
    {
      "token" : "same",
      "start_offset" : 56,
      "end_offset" : 60,
      "type" : "<ALPHANUM>",
      "position" : 14
    },
    {
      "token" : "today",
      "start_offset" : 61,
      "end_offset" : 66,
      "type" : "<ALPHANUM>",
      "position" : 15
    },
    {
      "token" : "alway",
      "start_offset" : 75,
      "end_offset" : 80,
      "type" : "<ALPHANUM>",
      "position" : 18
    },
    {
      "token" : "s",
      "start_offset" : 80,
      "end_offset" : 81,
      "type" : "<ALPHANUM>",
      "position" : 19
    },
    {
      "token" : "never",
      "start_offset" : 99,
      "end_offset" : 104,
      "type" : "<ALPHANUM>",
      "position" : 24
    },
    {
      "token" : "chang",
      "start_offset" : 105,
      "end_offset" : 110,
      "type" : "<ALPHANUM>",
      "position" : 25
    },
    {
      "token" : "e",
      "start_offset" : 110,
      "end_offset" : 111,
      "type" : "<ALPHANUM>",
      "position" : 26
    }
  ]
}
5. keyword vs. text

A text field is analyzed: its value is split into terms and the terms are written to the inverted index. It supports full-text search, but it is not suited to aggregations or sorting.

A keyword field is not analyzed: the whole field value is indexed as a single term. It does not support full-text search (only exact matching), but it does support aggregations and sorting. A short example follows.

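A minimal sketch of the difference (the index name idx_kw_text, the field names, and the sample document below are made up for illustration): the match query searches the analyzed text field, the term query matches the keyword field only against the exact stored value, and the aggregation runs on the keyword field.

# Hypothetical index with one text field and one keyword field
PUT idx_kw_text
{
  "mappings": {
    "properties": {
      "title":    { "type": "text" },
      "category": { "type": "keyword" }
    }
  }
}

POST idx_kw_text/_doc
{
  "title": "A good book may be among the best of friends",
  "category": "reading notes"
}

# Full-text search on the analyzed text field matches a single term
POST idx_kw_text/_search
{
  "query": { "match": { "title": "book" } }
}

# A term query on the keyword field matches only the exact value
POST idx_kw_text/_search
{
  "query": { "term": { "category": "reading notes" } }
}

# Aggregations (and sorting) work on the keyword field
POST idx_kw_text/_search
{
  "size": 0,
  "aggs": {
    "by_category": { "terms": { "field": "category" } }
  }
}
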
6. IK analyzer
Download

Search GitHub for elasticsearch-analysis-ik.

Download URL: https://github.com/medcl/elasticsearch-analysis-ik/releases

Installation

Create an ik subdirectory under the plugins directory of the Elasticsearch installation:

[esuser@db plugins]$ mkdir -p /home/elasticsearch/elasticsearch-7.13.2/plugins/ik
[esuser@db plugins]$ ll
total 0
drwxr-xr-x. 2 esuser esgroup 6 Mar  8 16:45 ik
[esuser@db plugins]$ 

Change into the ik directory:

[esuser@db ~]$ cd /home/elasticsearch/elasticsearch-7.13.2/plugins/ik
[esuser@db ik]$

Upload the elasticsearch-analysis-ik package and unzip it:

unzip elasticsearch-analysis-ik-7.13.2.zip

Restart Elasticsearch:

[esuser@db ~]$ cd /home/elasticsearch/elasticsearch-7.13.2
[esuser@db elasticsearch-7.13.2]$ ./bin/elasticsearch

Test

Without the IK analyzer (the default standard analyzer is used):

POST _analyze
{
  "text": "内心没有分别心,就是真正的苦行"
}

Result:

{
  "tokens" : [
    {
      "token" : "内",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<IDEOGRAPHIC>",
      "position" : 0
    },
    {
      "token" : "心",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    {
      "token" : "没",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "<IDEOGRAPHIC>",
      "position" : 2
    },
    {
      "token" : "有",
      "start_offset" : 3,
      "end_offset" : 4,
      "type" : "<IDEOGRAPHIC>",
      "position" : 3
    },
    {
      "token" : "分",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "<IDEOGRAPHIC>",
      "position" : 4
    },
    {
      "token" : "别",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "<IDEOGRAPHIC>",
      "position" : 5
    },
    {
      "token" : "心",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "<IDEOGRAPHIC>",
      "position" : 6
    },
    {
      "token" : "就",
      "start_offset" : 8,
      "end_offset" : 9,
      "type" : "<IDEOGRAPHIC>",
      "position" : 7
    },
    {
      "token" : "是",
      "start_offset" : 9,
      "end_offset" : 10,
      "type" : "<IDEOGRAPHIC>",
      "position" : 8
    },
    {
      "token" : "真",
      "start_offset" : 10,
      "end_offset" : 11,
      "type" : "<IDEOGRAPHIC>",
      "position" : 9
    },
    {
      "token" : "正",
      "start_offset" : 11,
      "end_offset" : 12,
      "type" : "<IDEOGRAPHIC>",
      "position" : 10
    },
    {
      "token" : "的",
      "start_offset" : 12,
      "end_offset" : 13,
      "type" : "<IDEOGRAPHIC>",
      "position" : 11
    },
    {
      "token" : "苦",
      "start_offset" : 13,
      "end_offset" : 14,
      "type" : "<IDEOGRAPHIC>",
      "position" : 12
    },
    {
      "token" : "行",
      "start_offset" : 14,
      "end_offset" : 15,
      "type" : "<IDEOGRAPHIC>",
      "position" : 13
    }
  ]
}

With the IK analyzer:

POST _analyze
{
  "analyzer": "ik_max_word", 
  "text": "内心没有分别心,就是真正的苦行"
}

Result:

{
  "tokens" : [
    {
      "token" : "内心",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "没有",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "分别",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "心",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "就是",
      "start_offset" : 8,
      "end_offset" : 10,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "真正",
      "start_offset" : 10,
      "end_offset" : 12,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "的",
      "start_offset" : 12,
      "end_offset" : 13,
      "type" : "CN_CHAR",
      "position" : 6
    },
    {
      "token" : "苦行",
      "start_offset" : 13,
      "end_offset" : 15,
      "type" : "CN_WORD",
      "position" : 7
    }
  ]
}

Note:

If the IK plugin is not installed, specifying "analyzer": "ik_max_word" returns an error.

If the IK plugin is installed but no analyzer is specified, the default (standard) analyzer is used.

IK tokenization modes

1. ik_max_word: splits the text at the finest granularity, producing overlapping terms.

2. ik_smart: splits the text at the coarsest granularity.

ik_max_word
POST _analyze
{
  "analyzer": "ik_max_word",
  "text": "我是中国人"
}

Result:

{
  "tokens" : [
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "是",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 1
    },
    {
      "token" : "中国人",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "中国",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "国人",
      "start_offset" : 3,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 4
    }
  ]
}
ik_smart
POST _analyze
{
  "analyzer": "ik_smart",
  "text": "我是中国人"
}

Result:

{
  "tokens" : [
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "是",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 1
    },
    {
      "token" : "中国人",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 2
    }
  ]
}
Creating index mappings that use the IK analyzer

All text fields use the ik_max_word mode by default:


# Create an index mapping; the index-level default analyzer makes every text field use ik_max_word
PUT idx_user_ik_01
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ik": {
          "tokenizer": "ik_max_word"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "user_id": {
        "type": "integer"
      },
      "user_name": {
        "type": "text",
        "fields": {
          "username": {
            "type": "keyword"
          }
        }
      },
      "age": {
        "type": "integer"
      },
      "address": {
        "type": "text"
      }
    }
  }
}
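
To check that the index-level default analyzer is really applied to the text fields, you can analyze a sample value against one of the mapped fields (assuming the index above has been created and the IK plugin is installed):

# Analyze a value with the analyzer that the address field resolves to
POST idx_user_ik_01/_analyze
{
  "field": "address",
  "text": "我是中国人"
}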

Create an IK index mapping that specifies an analyzer for each text field. Here analyzer controls how the field value is tokenized at index time, and search_analyzer controls how query strings against that field are analyzed at search time.

# Create an index mapping that specifies an analyzer for each text field
PUT idx_user_ik_02
{
  "mappings": {
    "properties": {
      "user_id": {
        "type": "integer"
      },
      "user_name": {
        "type": "text",
        "analyzer": "ik_smart",
        "search_analyzer": "ik_smart"
      },
      "age": {
        "type": "integer"
      },
      "address": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "standard"
      }
    }
  }
}

PUT idx_user_info_ik
{
  "mappings": {
    "properties": {
      "name": {
        "type": "keyword"
      },
      "address": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      }
    }
  }
}

POST idx_user_info_ik/_doc
{
  "name":"曹操",
  "address":"魏国"
}

POST idx_user_info_ik/_doc
{
  "name":"诸葛亮",
  "address":"蜀国"
}

POST idx_user_info_ik/_doc
{
  "name":"周瑜",
  "address":"吴国"
}

POST idx_user_info_ik/_doc
{
  "name":"张飞",
  "address":"蜀国"
}

POST idx_user_info_ik/_search
{
  "query": {
    "match": {
      "address": {
        "query": "蜀国"
      }
    }
  }
}

Query result:

{
  "took" : 19,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 0.6931471,
    "hits" : [
      {
        "_index" : "idx_user_info_ik",
        "_type" : "_doc",
        "_id" : "dQDywIYBnnKcIM3DPvhR",
        "_score" : 0.6931471,
        "_source" : {
          "name" : "诸葛亮",
          "address" : "蜀国"
        }
      },
      {
        "_index" : "idx_user_info_ik",
        "_type" : "_doc",
        "_id" : "dwDywIYBnnKcIM3DUvhX",
        "_score" : 0.6931471,
        "_source" : {
          "name" : "张飞",
          "address" : "蜀国"
        }
      }
    ]
  }
}
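
Both documents whose address is 蜀国 are returned because the terms produced by ik_max_word at index time and by ik_smart on the query string share at least one term. You can compare the two modes on the indexed value directly (a quick check, assuming the IK plugin is installed):

POST _analyze
{
  "analyzer": "ik_max_word",
  "text": "蜀国"
}

POST _analyze
{
  "analyzer": "ik_smart",
  "text": "蜀国"
}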