ElasticSearch自定义分词器

真实开发中,我们往往需要对同一段内容既进行中文分词,又进行拼音分词,此时需要自定义 ik + pinyin 分词器。

  1. 创建自定义分词器:
    在创建索引时自定义分词器
PUT /people
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ik_pinyin":{// 自定义分词器名
          "tokenizer":"ik_max_word",// 基本分词器:先用 ik 做中文分词
          "filter":["pinyin_filter"]// 词元过滤器(token filter)列表,按顺序作用于分词结果
        }
      },
      "filter": {// 自定义词元过滤器:对 ik 的分词结果追加拼音词元,效果上相当于同时使用两个分词器
        "pinyin_filter":{
          "type":"pinyin",// 使用 pinyin 插件提供的词元过滤器
          // 拼音过滤器的配置
          // keep_first_letter 未显式配置,默认为 true,因此会生成首字母缩写词元(如 伟大 -> wd)
          "keep_separate_first_letter":false,// 是否把每个字的首字母单独作为词元
          "keep_full_pinyin":true,// 是否保留每个字的全拼
          "keep_original":true,// 是否保留原始输入
          "remove_duplicated_term":true// 是否删除重复项
        }
      }
    }
  }
}
//为索引创建结构
POST /people/_mapping
{
  "properties":{
    "id":{
      "type":"integer",
      "store":true,
      "index":true
    },
    "name":{
      "type":"text",
      "store":true,
      "index":true,
      "analyzer":"ik_pinyin"
    },
    "desc":{
      "type":"text",
      "store":true,
      "index":true,
      "analyzer":"ik_max_word"
    }
  }
}
//添加一条记录
POST /people/_doc/1
{
  "id":1001,
  "name":"湖人俱乐部的科比",
  "desc":"科比是NBA最伟大的运动员"
}
  2. 测试自定义分词器:
GET /people/_analyze
{ 
	"text": "科比是NBA最伟大的运动员", 
	"analyzer": "ik_pinyin" 
}
//测试结果
{
  "tokens" : [
    {
      "token" : "shi",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "是",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "s",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "n",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "ENGLISH",
      "position" : 1
    },
    {
      "token" : "ba",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "ENGLISH",
      "position" : 2
    },
    {
      "token" : "nba",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "ENGLISH",
      "position" : 2
    },
    {
      "token" : "zui",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "最",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "z",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "wei",
      "start_offset" : 7,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "da",
      "start_offset" : 7,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "伟大",
      "start_offset" : 7,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "wd",
      "start_offset" : 7,
      "end_offset" : 9,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "de",
      "start_offset" : 9,
      "end_offset" : 10,
      "type" : "CN_CHAR",
      "position" : 6
    },
    {
      "token" : "的",
      "start_offset" : 9,
      "end_offset" : 10,
      "type" : "CN_CHAR",
      "position" : 6
    },
    {
      "token" : "d",
      "start_offset" : 9,
      "end_offset" : 10,
      "type" : "CN_CHAR",
      "position" : 6
    },
    {
      "token" : "yun",
      "start_offset" : 10,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 7
    },
    {
      "token" : "dong",
      "start_offset" : 10,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 8
    },
    {
      "token" : "yuan",
      "start_offset" : 10,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 9
    },
    {
      "token" : "运动员",
      "start_offset" : 10,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 9
    },
    {
      "token" : "ydy",
      "start_offset" : 10,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 9
    },
    {
      "token" : "yun",
      "start_offset" : 10,
      "end_offset" : 12,
      "type" : "CN_WORD",
      "position" : 10
    },
    {
      "token" : "dong",
      "start_offset" : 10,
      "end_offset" : 12,
      "type" : "CN_WORD",
      "position" : 11
    },
    {
      "token" : "运动",
      "start_offset" : 10,
      "end_offset" : 12,
      "type" : "CN_WORD",
      "position" : 11
    },
    {
      "token" : "yd",
      "start_offset" : 10,
      "end_offset" : 12,
      "type" : "CN_WORD",
      "position" : 11
    },
    {
      "token" : "dong",
      "start_offset" : 11,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 12
    },
    {
      "token" : "yuan",
      "start_offset" : 11,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 13
    },
    {
      "token" : "动员",
      "start_offset" : 11,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 13
    },
    {
      "token" : "dy",
      "start_offset" : 11,
      "end_offset" : 13,
      "type" : "CN_WORD",
      "position" : 13
    }
  ]
}

按照中文分词检索:

GET /people/_search
{
  "query": {
    "term": {
      "name": {
      "value": "俱乐部"
      }
    }
  }
}
//结果
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.3754495,
    "hits" : [
      {
        "_index" : "people",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.3754495,
        "_source" : {
          "id" : 1001,
          "name" : "湖人俱乐部的科比",
          "desc" : "科比是NBA最伟大的运动员"
        }
      }
    ]
  }
}

按照拼音检索:

GET /people/_search
{
  "query": {
    "term": {
      "name": {
      "value": "jlb"
      }
    }
  }
}
//结果
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.3754495,
    "hits" : [
      {
        "_index" : "people",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.3754495,
        "_source" : {
          "id" : 1001,
          "name" : "湖人俱乐部的科比",
          "desc" : "科比是NBA最伟大的运动员"
        }
      }
    ]
  }
}
  • 0
    点赞
  • 5
    收藏
  • 打赏
    打赏
  • 0
    评论
©️2022 CSDN 皮肤主题:岁月 设计师:pinMode 返回首页
评论

打赏作者

yqq love yn

你的鼓励将是我创作的最大动力

¥2 ¥4 ¥6 ¥10 ¥20
输入1-500的整数
余额支付 (余额:-- )
扫码支付
扫码支付:¥2
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值