系统分析器(analyzer)由三种类型的组件构成
character filter:在tokenizer之前进行文本处理,包括过滤特殊字符、替换字符,例如 html 标签过滤、正则匹配替换
tokenizer:将原始文本按照一定的规则切分成词 例如,whitespace/standard/uax_url_email/keyword/pattern/path_hierarchy(文件路径)
token filter:将tokenizer分好的词进行增加、删除、修改,例如:lowercase/stop/synonym(同义词)
自定义分析器
例子
//测试分词器 通过 _analyze进行测试分词器 POST _analyze { "tokenizer": "keyword", "char_filter": ["html_strip"], "text": ["<b>asdsadsa</b>","<a>sdafdsfdsa</a>"] } 结果: { "tokens" : [ { "token" : "asdsadsa", "start_offset" : 3, "end_offset" : 15, "type" : "word", "position" : 0 }, { "token" : "sdafdsfdsa", "start_offset" : 19, "end_offset" : 33, "type" : "word", "position" : 101 } ] } //通过mapping过滤器进行替换字符 POST _analyze { "tokenizer": "standard", "char_filter": [{ "type":"mapping", "mappings":["-=>_"] }], "text": ["123-456","I-test"] } 结果: { "tokens" : [ { "token" : "123_456", "start_offset" : 0, "end_offset" : 7, "type" : "<NUM>", "position" : 0 }, { "token" : "I_test", "start_offset" : 8, "end_offset" : 14, "type" : "<ALPHANUM>", "position" : 101 } ] } //正则表达式过滤 POST _analyze { "tokenizer": "standard", "char_filter": [{ "type":"pattern_replace", "pattern":"http://(.*)", "replacement":"$1" }], "text": ["http://www.baidu.com"] } 结果: { "tokens" : [ { "token" : "www.baidu.com", "start_offset" : 0, "end_offset" : 20, "type" : "<ALPHANUM>", "position" : 0 } ] } //文件路径分词器 POST _analyze { "tokenizer": "path_hierarchy", "text": ["/user/ym/a/b/c/d/e"] } 结果: { "tokens" : [ { "token" : "/user", "start_offset" : 0, "end_offset" : 5, "type" : "word", "position" : 0 }, { "token" : "/user/ym", "start_offset" : 0, "end_offset" : 8, "type" : "word", "position" : 0 }, { "token" : "/user/ym/a", "start_offset" : 0, "end_offset" : 10, "type" : "word", "position" : 0 }, { "token" : "/user/ym/a/b", "start_offset" : 0, "end_offset" : 12, "type" : "word", "position" : 0 }, { "token" : "/user/ym/a/b/c", "start_offset" : 0, "end_offset" : 14, "type" : "word", "position" : 0 }, { "token" : "/user/ym/a/b/c/d", "start_offset" : 0, "end_offset" : 16, "type" : "word", "position" : 0 }, { "token" : "/user/ym/a/b/c/d/e", "start_offset" : 0, "end_offset" : 18, "type" : "word", "position" : 0 } ] } //whitespace和stop POST _analyze { "tokenizer": "whitespace", "filter": ["stop"], "text": ["The rain in Spain falls 
mainly on the plain."] } 结果: { "tokens" : [ { "token" : "The", "start_offset" : 0, "end_offset" : 3, "type" : "word", "position" : 0 }, { "token" : "rain", "start_offset" : 4, "end_offset" : 8, "type" : "word", "position" : 1 }, { "token" : "Spain", "start_offset" : 12, "end_offset" : 17, "type" : "word", "position" : 3 }, { "token" : "falls", "start_offset" : 18, "end_offset" : 23, "type" : "word", "position" : 4 }, { "token" : "mainly", "start_offset" : 24, "end_offset" : 30, "type" : "word", "position" : 5 }, { "token" : "plain.", "start_offset" : 38, "end_offset" : 44, "type" : "word", "position" : 8 } ] } //小写转换 lowercase 如果将lowercase 和stop 的顺序颠倒后,the会被转换成小写并且不会被过滤 POST _analyze { "tokenizer": "whitespace", "filter": ["lowercase","stop"], "text": ["The rain in Spain falls mainly on the plain."] } 结果: { "tokens" : [ { "token" : "rain", "start_offset" : 4, "end_offset" : 8, "type" : "word", "position" : 1 }, { "token" : "spain", "start_offset" : 12, "end_offset" : 17, "type" : "word", "position" : 3 }, { "token" : "falls", "start_offset" : 18, "end_offset" : 23, "type" : "word", "position" : 4 }, { "token" : "mainly", "start_offset" : 24, "end_offset" : 30, "type" : "word", "position" : 5 }, { "token" : "plain.", "start_offset" : 38, "end_offset" : 44, "type" : "word", "position" : 8 } ] }
自定义分词器
PUT my_index { "settings": { "analysis": { "analyzer": { "my_custom_analyzer":{ "type":"custom", "char_filter":[//引用自定义char_filter "emoticons" ], "tokenizer":"punctuation",//引用自定义tokenizer "filter":["lowercase","english_stop"]//引用自定义filter } }, "tokenizer": {//创建自定义tokenizer punctuation "punctuation":{ "type":"pattern", "pattern":"[.,!?]" } }, "char_filter": {//创建自定义char_filter 名称为emoticons "emoticons":{ "type":"mapping", "mappings":[ ":)=>_happy_", ":(=>_sad_" ] } }, "filter": {//创建自定义filter english_stop "english_stop":{ "type":"stop", "stopwords":"_english_" } } } } } //自定义分词器测试
若有收获,就点个赞吧
03-21 15:08
1
0