Elasticsearch分词器
1、simple分词器
simple分词器在遇到非字母字符时对文本进行拆分,并将分词后的内容转换成小写格式。
# 对指定内容根据simple分词器进行分词
POST _analyze
{
"analyzer": "simple",
"text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}
结果:
{
"tokens" : [
{
"token" : "a",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "good",
"start_offset" : 2,
"end_offset" : 6,
"type" : "word",
"position" : 1
},
{
"token" : "book",
"start_offset" : 7,
"end_offset" : 11,
"type" : "word",
"position" : 2
},
{
"token" : "may",
"start_offset" : 12,
"end_offset" : 15,
"type" : "word",
"position" : 3
},
{
"token" : "be",
"start_offset" : 16,
"end_offset" : 18,
"type" : "word",
"position" : 4
},
{
"token" : "among",
"start_offset" : 19,
"end_offset" : 24,
"type" : "word",
"position" : 5
},
{
"token" : "the",
"start_offset" : 25,
"end_offset" : 28,
"type" : "word",
"position" : 6
},
{
"token" : "best",
"start_offset" : 29,
"end_offset" : 33,
"type" : "word",
"position" : 7
},
{
"token" : "of",
"start_offset" : 34,
"end_offset" : 36,
"type" : "word",
"position" : 8
},
{
"token" : "friends",
"start_offset" : 37,
"end_offset" : 44,
"type" : "word",
"position" : 9
},
{
"token" : "it",
"start_offset" : 46,
"end_offset" : 48,
"type" : "word",
"position" : 10
},
{
"token" : "is",
"start_offset" : 49,
"end_offset" : 51,
"type" : "word",
"position" : 11
},
{
"token" : "the",
"start_offset" : 52,
"end_offset" : 55,
"type" : "word",
"position" : 12
},
{
"token" : "same",
"start_offset" : 56,
"end_offset" : 60,
"type" : "word",
"position" : 13
},
{
"token" : "today",
"start_offset" : 61,
"end_offset" : 66,
"type" : "word",
"position" : 14
},
{
"token" : "that",
"start_offset" : 67,
"end_offset" : 71,
"type" : "word",
"position" : 15
},
{
"token" : "it",
"start_offset" : 72,
"end_offset" : 74,
"type" : "word",
"position" : 16
},
{
"token" : "always",
"start_offset" : 75,
"end_offset" : 81,
"type" : "word",
"position" : 17
},
{
"token" : "was",
"start_offset" : 82,
"end_offset" : 85,
"type" : "word",
"position" : 18
},
{
"token" : "and",
"start_offset" : 87,
"end_offset" : 90,
"type" : "word",
"position" : 19
},
{
"token" : "it",
"start_offset" : 91,
"end_offset" : 93,
"type" : "word",
"position" : 20
},
{
"token" : "will",
"start_offset" : 94,
"end_offset" : 98,
"type" : "word",
"position" : 21
},
{
"token" : "never",
"start_offset" : 99,
"end_offset" : 104,
"type" : "word",
"position" : 22
},
{
"token" : "change",
"start_offset" : 105,
"end_offset" : 111,
"type" : "word",
"position" : 23
}
]
}
中文:
POST _analyze
{
"analyzer": "simple",
"text": "生活就是不断突破自我的过程。我们努力地向上,不仅是让世界看到我们,更是为了让自己看到世界。当我们一步一个脚印往前走时,就会发现,每一点进步,都在让我们的人生变得更辽阔。"
}
结果:
{
"tokens" : [
{
"token" : "生活就是不断突破自我的过程",
"start_offset" : 0,
"end_offset" : 13,
"type" : "word",
"position" : 0
},
{
"token" : "我们努力地向上",
"start_offset" : 14,
"end_offset" : 21,
"type" : "word",
"position" : 1
},
{
"token" : "不仅是让世界看到我们",
"start_offset" : 22,
"end_offset" : 32,
"type" : "word",
"position" : 2
},
{
"token" : "更是为了让自己看到世界",
"start_offset" : 33,
"end_offset" : 44,
"type" : "word",
"position" : 3
},
{
"token" : "当我们一步一个脚印往前走时",
"start_offset" : 45,
"end_offset" : 58,
"type" : "word",
"position" : 4
},
{
"token" : "就会发现",
"start_offset" : 59,
"end_offset" : 63,
"type" : "word",
"position" : 5
},
{
"token" : "每一点进步",
"start_offset" : 64,
"end_offset" : 69,
"type" : "word",
"position" : 6
},
{
"token" : "都在让我们的人生变得更辽阔",
"start_offset" : 70,
"end_offset" : 83,
"type" : "word",
"position" : 7
}
]
}
2、simple_pattern分词器
根据正则表达式进行分词的分词器。
# 创建索引映射,定义字段内容的分词的正则表达式
PUT idx_custom_simplete_pattern
{
"settings": {
"analysis": {
"analyzer": {
"custom_pattern_analyzer": {
"tokenizer": "custom_pattern_tokenizer"
}
},
"tokenizer": {
"custom_pattern_tokenizer": {
"type": "simple_pattern",
"pattern":"[0123456789]{3}"
}
}
}
}
}
POST idx_custom_simplete_pattern/_analyze
{
"analyzer": "custom_pattern_analyzer",
"text": "ads-132-754-235-23d-000"
}
结果:
{
"tokens" : [
{
"token" : "132",
"start_offset" : 4,
"end_offset" : 7,
"type" : "word",
"position" : 0
},
{
"token" : "754",
"start_offset" : 8,
"end_offset" : 11,
"type" : "word",
"position" : 1
},
{
"token" : "235",
"start_offset" : 12,
"end_offset" : 15,
"type" : "word",
"position" : 2
},
{
"token" : "000",
"start_offset" : 20,
"end_offset" : 23,
"type" : "word",
"position" : 3
}
]
}
3、simple_pattern_split分词器
simple_pattern_split分词器根据指定的分词符号进行拆分,比simple_pattern分词器功能更有限,但是分词效率高。
默认分词符号是空字符串。
# 创建索引映射,指定字段内容分词匹配符号
PUT idx_custom_simple_pattern_split
{
"settings": {
"analysis": {
"analyzer": {
"custom_simple_pattern_split_analyzer": {
"tokenizer": "custom_tokenizer"
}
},
"tokenizer": {
"custom_tokenizer": {
"type": "simple_pattern_split",
"pattern": "-"
}
}
}
}
}
POST idx_custom_simple_pattern_split/_analyze
{
"analyzer": "custom_simple_pattern_split_analyzer",
"text": "ads-132-754-235-23d-000"
}
结果:
{
"tokens" : [
{
"token" : "ads",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 0
},
{
"token" : "132",
"start_offset" : 4,
"end_offset" : 7,
"type" : "word",
"position" : 1
},
{
"token" : "754",
"start_offset" : 8,
"end_offset" : 11,
"type" : "word",
"position" : 2
},
{
"token" : "235",
"start_offset" : 12,
"end_offset" : 15,
"type" : "word",
"position" : 3
},
{
"token" : "23d",
"start_offset" : 16,
"end_offset" : 19,
"type" : "word",
"position" : 4
},
{
"token" : "000",
"start_offset" : 20,
"end_offset" : 23,
"type" : "word",
"position" : 5
}
]
}
4、standard分词器
standard分词器是默认分词器,基于Unicode文本分割算法进行分词。
# 使用标准分词器
POST _analyze
{
"analyzer": "standard",
"text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}
结果:
{
"tokens" : [
{
"token" : "a",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "good",
"start_offset" : 2,
"end_offset" : 6,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "book",
"start_offset" : 7,
"end_offset" : 11,
"type" : "<ALPHANUM>",
"position" : 2
},
。。。。
{
"token" : "change",
"start_offset" : 105,
"end_offset" : 111,
"type" : "<ALPHANUM>",
"position" : 23
}
]
}
POST _analyze
{
"analyzer": "standard",
"text": "生活就是不断突破自我的过程。我们努力地向上,不仅是让世界看到我们,更是为了让自己看到世界。当我们一步一个脚印往前走时,就会发现,每一点进步,都在让我们的人生变得更辽阔。"
}
结果:
{
"tokens" : [
{
"token" : "生",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "活",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "就",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
。。。。
{
"token" : "辽",
"start_offset" : 81,
"end_offset" : 82,
"type" : "<IDEOGRAPHIC>",
"position" : 74
},
{
"token" : "阔",
"start_offset" : 82,
"end_offset" : 83,
"type" : "<IDEOGRAPHIC>",
"position" : 75
}
]
}
# 配置分词规则(==慎用!!!!==)
PUT idx_standard
{
"settings": {
"analysis": {
"analyzer": {
"english_analyzer":{
"type":"standard",
"max_token_length":5,
"stopwords":"_english_"
}
}
}
}
}
# 使用标准分词器
POST idx_standard/_analyze
{
"analyzer": "english_analyzer",
"text": "A good book may be among the best of friends. It is the same today that it always was, and it will never change. "
}
结果:
{
"tokens" : [
{
"token" : "good",
"start_offset" : 2,
"end_offset" : 6,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "book",
"start_offset" : 7,
"end_offset" : 11,
"type" : "<ALPHANUM>",
"position" : 2
},
{
"token" : "may",
"start_offset" : 12,
"end_offset" : 15,
"type" : "<ALPHANUM>",
"position" : 3
},
{
"token" : "among",
"start_offset" : 19,
"end_offset" : 24,
"type" : "<ALPHANUM>",
"position" : 5
},
{
"token" : "best",
"start_offset" : 29,
"end_offset" : 33,
"type" : "<ALPHANUM>",
"position" : 7
},
{
"token" : "frien",
"start_offset" : 37,
"end_offset" : 42,
"type" : "<ALPHANUM>",
"position" : 9
},
{
"token" : "ds",
"start_offset" : 42,
"end_offset" : 44,
"type" : "<ALPHANUM>",
"position" : 10
},
{
"token" : "same",
"start_offset" : 56,
"end_offset" : 60,
"type" : "<ALPHANUM>",
"position" : 14
},
{
"token" : "today",
"start_offset" : 61,
"end_offset" : 66,
"type" : "<ALPHANUM>",
"position" : 15
},
{
"token" : "alway",
"start_offset" : 75,
"end_offset" : 80,
"type" : "<ALPHANUM>",
"position" : 18
},
{
"token" : "s",
"start_offset" : 80,
"end_offset" : 81,
"type" : "<ALPHANUM>",
"position" : 19
},
{
"token" : "never",
"start_offset" : 99,
"end_offset" : 104,
"type" : "<ALPHANUM>",
"position" : 24
},
{
"token" : "chang",
"start_offset" : 105,
"end_offset" : 110,
"type" : "<ALPHANUM>",
"position" : 25
},
{
"token" : "e",
"start_offset" : 110,
"end_offset" : 111,
"type" : "<ALPHANUM>",
"position" : 26
}
]
}
5、keyword和text区别
text类型字段会进行分词处理,然后分词后的单词建立倒排索引。
支持全文检索,不支持聚合计算和排序。
keyword类型字段不会分词处理,直接根据字段内容进行倒排索引。
不支持全文检索,支持聚合计算和排序。
6、IK分词器
下载
在Github官网搜索Elasticsearch-analysis-ik
下载地址:https://github.com/medcl/elasticsearch-analysis-ik/releases
安装
在elasticsearch安装目录的plugins目录中创建ik子目录
[esuser@db plugins]$ mkdir -p /home/elasticsearch/elasticsearch-7.13.2/plugins/ik [esuser@db plugins]$ ll 总用量 0 drwxr-xr-x. 2 esuser esgroup 6 3月 8 16:45 ik [esuser@db plugins]$
切换到ik
[esuser@db ~]# cd /home/elasticsearch/elasticsearch-7.13.2/plugins/ik [esuser@db ik]#
上传elasticsearch-analysis-ik安装包,解压
unzip elasticsearch-analysis-ik-7.13.2.zip
重启ES
[esuser@db ~]# cd /home/elasticsearch/elasticsearch-7.13.2 [esuser@db ~]# ./bin/elasticsearch
测试
没有使用IK分词器
POST _analyze
{
"text": "内心没有分别心,就是真正的苦行"
}
结果:
{
"tokens" : [
{
"token" : "内",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "心",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "没",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "有",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "分",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "别",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
},
{
"token" : "心",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 6
},
{
"token" : "就",
"start_offset" : 8,
"end_offset" : 9,
"type" : "<IDEOGRAPHIC>",
"position" : 7
},
{
"token" : "是",
"start_offset" : 9,
"end_offset" : 10,
"type" : "<IDEOGRAPHIC>",
"position" : 8
},
{
"token" : "真",
"start_offset" : 10,
"end_offset" : 11,
"type" : "<IDEOGRAPHIC>",
"position" : 9
},
{
"token" : "正",
"start_offset" : 11,
"end_offset" : 12,
"type" : "<IDEOGRAPHIC>",
"position" : 10
},
{
"token" : "的",
"start_offset" : 12,
"end_offset" : 13,
"type" : "<IDEOGRAPHIC>",
"position" : 11
},
{
"token" : "苦",
"start_offset" : 13,
"end_offset" : 14,
"type" : "<IDEOGRAPHIC>",
"position" : 12
},
{
"token" : "行",
"start_offset" : 14,
"end_offset" : 15,
"type" : "<IDEOGRAPHIC>",
"position" : 13
}
]
}
使用IK分词器
POST _analyze
{
"analyzer": "ik_max_word",
"text": "内心没有分别心,就是真正的苦行"
}
结果:
{
"tokens" : [
{
"token" : "内心",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "没有",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "分别",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "心",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "就是",
"start_offset" : 8,
"end_offset" : 10,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "真正",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "的",
"start_offset" : 12,
"end_offset" : 13,
"type" : "CN_CHAR",
"position" : 6
},
{
"token" : "苦行",
"start_offset" : 13,
"end_offset" : 15,
"type" : "CN_WORD",
"position" : 7
}
]
}
注意:
未安装IK分词器,使用ik_max_word会报错
安装IK分词器后,若未指定"analyzer": "ik_max_word",则使用默认分词器
IK分词模式
1、ik_max_word:对文本进行最细粒度的拆分
2、ik_smart:对文本进行最粗粒度的拆分
ik_max_word
POST _analyze
{
"analyzer": "ik_max_word",
"text": "我是中国人"
}
结果:
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "是",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "中国人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "中国",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "国人",
"start_offset" : 3,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 4
}
]
}
ik_smart
POST _analyze
{
"analyzer": "ik_smart",
"text": "我是中国人"
}
结果:
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "是",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "中国人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 2
}
]
}
创建IK分词器的索引映射
text类型都使用ik_max_word分词模式
# 创建IK分词器索引映射,所有text类型都使用ik_max_word分词模式
PUT idx_user_ik_01
{
"settings": {
"analysis": {
"analyzer": {
"ik": {
"tokenizer": "ik_max_word"
}
}
}
},
"mappings": {
"properties": {
"user_id": {
"type": "integer"
},
"user_name": {
"type": "text",
"fields": {
"username": {
"type": "keyword"
}
}
},
"age": {
"type": "integer"
},
"address": {
"type": "text"
}
}
}
}
创建IK分词器索引映射,为每一个text指定分词模式
# 创建IK分词器索引映射,为每一个text指定分词模式
PUT idx_user_ik_02
{
"mappings": {
"properties": {
"user_id": {
"type": "integer"
},
"user_name": {
"type": "text",
"analyzer": "ik_smart",
"search_analyzer": "ik_smart"
},
"age": {
"type": "integer"
},
"address": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "standard"
}
}
}
}
PUT idx_user_info_ik
{
"mappings": {
"properties": {
"name": {
"type": "keyword"
},
"address": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
}
}
}
}
POST idx_user_info_ik/_doc
{
"name":"曹操",
"address":"魏国"
}
POST idx_user_info_ik/_doc
{
"name":"诸葛亮",
"address":"蜀国"
}
POST idx_user_info_ik/_doc
{
"name":"周瑜",
"address":"吴国"
}
POST idx_user_info_ik/_doc
{
"name":"张飞",
"address":"蜀国"
}
POST idx_user_info_ik/_search
{
"query": {
"match": {
"address": {
"query": "蜀国"
}
}
}
}
查询结果:
{
"took" : 19,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 0.6931471,
"hits" : [
{
"_index" : "idx_user_info_ik",
"_type" : "_doc",
"_id" : "dQDywIYBnnKcIM3DPvhR",
"_score" : 0.6931471,
"_source" : {
"name" : "诸葛亮",
"address" : "蜀国"
}
},
{
"_index" : "idx_user_info_ik",
"_type" : "_doc",
"_id" : "dwDywIYBnnKcIM3DUvhX",
"_score" : 0.6931471,
"_source" : {
"name" : "张飞",
"address" : "蜀国"
}
}
]
}
}