ES自定义分词:三部曲

1.Character filters 字符过滤

GET /_analyze
{
  "tokenizer": "keyword",
  "char_filter": [{        
    "type" : "html_strip",    //去除html标签 
    "escaped_tags" : ["p"]    //要跳过的html元素
  }],
  "text": ["<p>I'm so <b>happy</b>!</p>"]
}

GET /_analyze
{
  "tokenizer": "keyword",
  "char_filter": [{
    "type" : "pattern_replace",    //正则替换
    "pattern" : "\\d",            //正则表达式
    "replacement" : "2"            //替换的数据
  }, {
    "type" : "pattern_replace",
    "pattern" : "(\\d+)-(?=\\d)",
    "replacement" : "$1_"
  }],
  "text": ["My phone number is 123-456-789"]
}

GET /_analyze
{
  "tokenizer": "keyword",
  "char_filter": [{
    "type" : "mapping",    //字符映射转换
    "mappings" : [
        "٠ => 0",
        "١ => 1",
        "٢ => 2",
        "٣ => 3",
        "٤ => 4",
        "٥ => 5",
        "٦ => 6",
        "٧ => 7",
        "٨ => 8",
        "٩ => 9",
        "a => 97",
        "y => 是"
      ]
  }],
  "text": ["٠١٢٣٤٥٦٧٨٩ a y"]
}
//----------------------------------------------------------------
PUT /索引名
{
  "settings": {"number_of_shards": 1,
    "analysis": {
      "analyzer": {
        "my_analyzer" : {            //自定义分析器
          "tokenizer" : "keyword",
          "char_filter" : [
            "my_char_filter_html",
            "my_char_filter_mapping",
            "my_char_filter_pattern"
            ]
        }
      },
      "char_filter": {
        "my_char_filter_html" : {
          "type" : "html_strip",
          "escaped_tags" : "p"
        },
        "my_char_filter_mapping" : {
          "type" : "mapping",
          "mappings" : [
              "a => 97",
              "b => 98"
            ]
        },
        "my_char_filter_pattern" : {
          "type" : "pattern_replace",
          "pattern" : "-",
          "replacement" : "_"
        }
      }
    }
  }
}

GET /索引名/_analyze
{
  "analyzer": "my_analyzer",        //自定义分析器定义在索引中,需通过该索引调用
  "text": [""]
}

2.Tokenizer 将文本切分为词项(只能配置一个)

PUT /my_index?pretty
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"    //放入定义的分词器
        }
      },
      "tokenizer": {
        "my_tokenizer": {                //定义的分词器
          "type": "standard",            //分词器类型
          "max_token_length": 6          //分隔后字符串最大长度
        }
      }
    }
  }
}


POST /my_index/_analyze?pretty
{
  "analyzer": "my_analyzer",
  "text": "这是一段用于测试分词的文本"
}

3.Token filters 分词后再过滤

token过滤器接收令牌流,并且可以添加,删除或改变token。比如转小写、删除某些特殊的停用词、增加同义词等。

----------------------------------------------------------------------------

总结起来

PUT /索引名
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "ik_pinyin" : {                       //自定义分词器名
          "tokenizer":"ik_max_word",          // 基本分词器
          "char_filter" : [
            "my_char_filter_html",
            "my_char_filter_mapping",
            "my_char_filter_pattern"
            ],
          "filter":"pinyin_filter"            // 配置分词器过滤
         }
       },
       "char_filter": {
        "my_char_filter_html" : {
          "type" : "html_strip",
          "escaped_tags" : "p"
        },
        "my_char_filter_mapping" : {
          "type" : "mapping",
          "mappings" : [
              "a => 97",
              "b => 98"
            ]
        },
        "my_char_filter_pattern" : {
          "type" : "pattern_replace",
          "pattern" : "-",
          "replacement" : "_"
        }
      },
      "filter" : { // 自定义 token 过滤器:在基本分词器的结果上再做一次处理
        "pinyin_filter" : { 
          "type" : "pinyin",                         // 拼音过滤器(需安装 pinyin 插件)
          // 拼音分词器的配置
          "keep_separate_first_letter" : false,         // 是否分词每个字的首字母
          "keep_full_pinyin" : true,                    // 是否分词全拼
          "keep_original" : true,                       // 是否保留原始输入
          "remove_duplicated_term" : true               // 是否删除重复项
         }
       }
     }
   },
  "mappings":{
    "properties":{
      "域名1":{
        "type":域的类型,
        "store":是否单独存储,
        "index":是否创建索引,
             "analyzer":分词器
       },
      "域名2":{ 
        ...
       }
     }
   }
}
