1. Character filters 字符过滤(在分词之前对原始文本进行处理)
// Try the html_strip character filter through the _analyze API.
GET /_analyze
{
  "tokenizer": "keyword",
  "char_filter": [
    {
      // html_strip: removes HTML tags from the input text
      "type": "html_strip",
      // HTML elements to skip, i.e. leave in place (here: p)
      "escaped_tags": ["p"]
    }
  ],
  // text to analyze (left empty in these notes; fill in before running)
  "text": [""]
}
// Try two chained pattern_replace character filters through the _analyze API.
GET /_analyze
{
  "tokenizer": "keyword",
  "char_filter": [
    {
      // pattern_replace: regular-expression based replacement
      "type": "pattern_replace",
      // regular expression to match (a single digit)
      "pattern": "\\d",
      // text substituted for every match
      "replacement": "2"
    },
    {
      "type": "pattern_replace",
      // a run of digits (group 1) followed by "-", matched only when a digit comes next
      "pattern": "(\\d+)-(?=\\d)",
      // keep group 1 and put "_" where the "-" was
      "replacement": "$1_"
    }
  ],
  // text to analyze (left empty in these notes; fill in before running)
  "text": [""]
}
// Try the mapping character filter through the _analyze API.
GET /_analyze
{
  "tokenizer": "keyword",
  "char_filter": [
    {
      // mapping: character-to-string substitution
      "type": "mapping",
      // each entry has the form "key => replacement"
      "mappings": [
        "٠ => 0",
        "١ => 1",
        "٢ => 2",
        "٣ => 3",
        "٤ => 4",
        "٥ => 5",
        "٦ => 6",
        "٧ => 7",
        "٨ => 8",
        "٩ => 9",
        "a => 97",
        "y => 是"
      ]
    }
  ],
  // text to analyze (left empty in these notes; fill in before running)
  "text": [""]
}
//----------------------------------------------------------------
// Create an index whose analysis settings declare three named character
// filters and a custom analyzer that references them by name.
// "索引名" is a placeholder for the real index name.
PUT /索引名
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "analyzer": {
        // custom analyzer (an analyzer, not a filter): keyword tokenizer
        // preceded by the character filters listed below
        "my_analyzer": {
          "tokenizer": "keyword",
          "char_filter": [
            "my_char_filter_html",
            "my_char_filter_mapping",
            "my_char_filter_pattern"
          ]
        }
      },
      "char_filter": {
        "my_char_filter_html": {
          "type": "html_strip",
          // HTML elements to leave in place; written as a list of tag names
          "escaped_tags": ["p"]
        },
        "my_char_filter_mapping": {
          "type": "mapping",
          // each entry has the form "key => replacement"
          "mappings": [
            "a => 97",
            "b => 98"
          ]
        },
        "my_char_filter_pattern": {
          "type": "pattern_replace",
          // replace every "-" with "_"
          "pattern": "-",
          "replacement": "_"
        }
      }
    }
  }
}
// Analyze text with a custom analyzer defined on the index.
// "索引名" is a placeholder for the real index name.
GET /索引名/_analyze
{
  // put the name of the index's custom analyzer here
  "analyzer": "my_analyzer",
  // text to analyze (left empty in these notes; fill in before running)
  "text": [""]
}
2. Tokenizer 将文本切分为词元(每个分析器只能配置一个)
// Create my_index with a custom analyzer built on a configured tokenizer.
PUT /my_index?pretty
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          // reference the tokenizer defined under "tokenizer" below
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        // the custom tokenizer definition
        "my_tokenizer": {
          // tokenizer type
          "type": "standard",
          // maximum length of a token after splitting
          "max_token_length": 6
        }
      }
    }
  }
}
// Analyze sample text with my_index's custom analyzer ("my_analyzer").
POST /my_index/_analyze?pretty
{
  "analyzer": "my_analyzer",
  // sample input (the original request had no value here, which made the
  // body unparseable); replace with your own text. Words longer than the
  // tokenizer's max_token_length are expected to come back split.
  "text": ["Elasticsearch standard tokenizer"]
}
3. Token filters 分词后再过滤
Token 过滤器接收分词器输出的 token 流,可以添加、删除或修改其中的 token,例如转小写、删除停用词、添加同义词等。
----------------------------------------------------------------------------
总结起来
// Summary template: one index definition that combines character filters,
// a tokenizer, a token filter and field mappings.
// "索引名" and the values under "mappings" are placeholders to be replaced
// before the request can be run.
PUT /索引名
{
  "settings": {
    "analysis": {
      "analyzer": {
        // name of the custom analyzer
        "ik_pinyin": {
          // base tokenizer
          "tokenizer": "ik_max_word",
          "char_filter": [
            "my_char_filter_html",
            "my_char_filter_mapping",
            "my_char_filter_pattern"
          ],
          // token filters applied to the tokenizer's output
          // (a list, like "char_filter" above)
          "filter": ["pinyin_filter"]
        }
      },
      "char_filter": {
        "my_char_filter_html": {
          "type": "html_strip",
          // HTML elements to leave in place; written as a list of tag names
          "escaped_tags": ["p"]
        },
        "my_char_filter_mapping": {
          "type": "mapping",
          // each entry has the form "key => replacement"
          "mappings": [
            "a => 97",
            "b => 98"
          ]
        },
        "my_char_filter_pattern": {
          "type": "pattern_replace",
          // replace every "-" with "_"
          "pattern": "-",
          "replacement": "_"
        }
      },
      // Token filter definitions. pinyin is declared here as a token filter
      // (not as a second tokenizer): it post-processes the tokens produced
      // by the analyzer's tokenizer.
      "filter": {
        "pinyin_filter": {
          "type": "pinyin",
          // options of the pinyin filter
          "keep_separate_first_letter": false, // whether to emit the first letter of each character as its own token
          "keep_full_pinyin": true,            // whether to emit the full pinyin
          "keep_original": true,               // whether to keep the original input
          "remove_duplicated_term": true       // whether to remove duplicate terms
        }
      }
    }
  },
  "mappings": {
    "properties": {
      // "域名1" / "域名2" are placeholders for field names
      "域名1": {
        "type": 域的类型,        // placeholder: the field's type
        "store": 是否单独存储,   // placeholder: whether to store the field separately
        "index": 是否创建索引,   // placeholder: whether to index the field
        "analyzer": 分词器       // placeholder: name of the analyzer to use
      },
      "域名2": {
        ...
      }
    }
  }
}