// 一、分词器(analyzer)
// 分词器由三部分构成:character filter、tokenizer、token filter
// 二、主要的分词器
// Analyze text with an explicitly named built-in analyzer (here: standard)
GET /_analyze
{
"analyzer": "standard",
"text":"I am a student"
}
// Analyze text using the analyzer configured for a specific field of an index
POST lcy_test/_analyze
{
"field": "name",
"text":"Lebron James"
}
// Standard analyzer: splits on word boundaries, lowercases, does NOT remove stop words
GET /_analyze
{
"analyzer": "standard",
"text":"I am a good boy"
}
// Simple analyzer: splits on any non-letter character, lowercases, does NOT remove stop words
// ("a-good" therefore becomes two tokens: "a", "good")
GET /_analyze
{
"analyzer": "simple",
"text":"I am a-good boy"
}
// Whitespace analyzer: splits on whitespace only; no lowercasing ("A-good" stays one token)
GET /_analyze
{
"analyzer": "whitespace",
"text":"I am A-good boy"
}
// Stop analyzer: splits on non-letter characters (like simple) and also removes English stop words
GET /_analyze
{
"analyzer": "stop",
"text":"I am A-good boy"
}
// Keyword analyzer: no splitting at all — the whole input becomes a single token
GET /_analyze
{
"analyzer": "keyword",
"text":"I am A-good boy"
}
// Pattern analyzer: splits by a configurable regular expression; default pattern is \W+ (non-word characters)
GET /_analyze
{
"analyzer": "pattern",
"text":"I am A-good boy"
}
// mapping字符过滤器(char filter,非分词器),在分词前将指定字符串替换为另一字符串
// Demo: the mapping char filter rewrites "==" to "am" BEFORE the keyword tokenizer runs
POST _analyze
{
"tokenizer": "keyword",
"char_filter": [
{
"type":"mapping",
"mappings":["== => am"]
}
]
, "text": "I == GOOD BOY"
}
// html_strip字符过滤器,将html标签去除
// Demo: chain two char filters — mapping ("==" -> "am") plus html_strip (removes HTML tags).
// Fix: the sample HTML used "<b/>" where the closing tag "</b>" was clearly intended.
POST _analyze
{
"tokenizer": "keyword",
"char_filter": [
{
"type":"mapping",
"mappings":["== => am"]
},
"html_strip"
]
, "text": "I == <b>GOOD</b> BOY"
}
// Path tokenizer (path_hierarchy): splits a filesystem-like path into its ancestor paths,
// e.g. "/usr/local/bin" -> ["/usr", "/usr/local", "/usr/local/bin"].
// (The original body here was an accidental copy of the html_strip example and never
// demonstrated a path tokenizer at all.)
POST _analyze
{
"tokenizer": "path_hierarchy",
"text": "/usr/local/bin"
}
// Token filters (post-processing): after tokenizing, lowercase each token and remove
// stop words (words with no real search value, such as articles and particles)
POST _analyze
{
"tokenizer": "whitespace",
"filter": ["lowercase","stop"],
"text": "The classroom is not here"
}
// Custom analyzer: mapping char filter ("==" -> "等等") + pattern tokenizer (split on . , / ?)
// + lowercase and a custom stop-word token filter.
// Fixes vs. original:
// 1. "stopwords" must be an ARRAY of words; a bare string like "hh" is interpreted by
//    Elasticsearch as the name of a predefined stop-word list and index creation fails.
// 2. The stop token filter was confusingly named "lcy_test3_char_filter" (same name as the
//    actual char filter); renamed to "lcy_test3_stop_filter". The analyzer name used by
//    callers ("lcy_test3_ana") is unchanged.
PUT lcy_test3
{
  "settings": {
    "analysis": {
      "analyzer": {
        "lcy_test3_ana": {
          "type": "custom",
          "char_filter": ["lcy_test3_char_filter"],
          "tokenizer": "lcy_test3_tokenizer",
          "filter": ["lowercase", "lcy_test3_stop_filter"]
        }
      },
      "char_filter": {
        "lcy_test3_char_filter": {
          "type": "mapping",
          "mappings": ["== => 等等"]
        }
      },
      "tokenizer": {
        "lcy_test3_tokenizer": {
          "type": "pattern",
          "pattern": "[.,/?]"
        }
      },
      "filter": {
        "lcy_test3_stop_filter": {
          "type": "stop",
          "stopwords": ["hh"]
        }
      }
    }
  }
}
// Exercise the custom analyzer defined on index lcy_test3
// (expected: "hh" removed by the stop filter, "==" mapped by the char filter, split on "?")
POST lcy_test3/_analyze
{
"analyzer": "lcy_test3_ana",
"text":"I hh == for? me"
}