cnblog的排版好看很多,所以在这里建一篇分享博客。
-----------------------------------------------------------------------------------------------
扩展字典中的词会被作为整体切分出来,扩展停止词字典中的词会被过滤掉。
1.未加入扩展字典和停止词字典时的用法
1) ik分词器
[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{"tokens": [ {"token" : "自古","start_offset" : 0,"end_offset" : 2,"type" : "CN_WORD","position" : 0}, {"token" : "刀","start_offset" : 2,"end_offset" : 3,"type" : "CN_WORD","position" : 1}, {"token" : "扇","start_offset" : 3,"end_offset" : 4,"type" : "CN_WORD","position" : 2}, {"token" : "过","start_offset" : 4,"end_offset" : 5,"type" : "CN_CHAR","position" : 3}, {"token" : "背","start_offset" : 5,"end_offset" : 6,"type" : "CN_WORD","position" : 4}, {"token" : "刺","start_offset" : 6,"end_offset" : 7,"type" : "CN_CHAR","position" : 5} ]
}
2) ik_smart分词器
[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{"tokens": [ {"token" : "自古","start_offset" : 0,"end_offset" : 2,"type" : "CN_WORD","position" : 0}, {"token" : "刀","start_offset" : 2,"end_offset" : 3,"type" : "CN_WORD","position" : 1}, {"token" : "扇","start_offset" : 3,"end_offset" : 4,"type" : "CN_WORD","position" : 2}, {"token" : "过","start_offset" : 4,"end_offset" : 5,"type" : "CN_CHAR","position" : 3}, {"token" : "背","start_offset" : 5,"end_offset" : 6,"type" : "CN_WORD","position" : 4}, {"token" : "刺","start_offset" : 6,"end_offset" : 7,"type" : "CN_CHAR","position" : 5} ]
}
3) ik_max_word分词器
[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{"tokens": [ {"token" : "自古","start_offset" : 0,"end_offset" : 2,"type" : "CN_WORD","position" : 0}, {"token" : "刀","start_offset" : 2,"end_offset" : 3,"type" : "CN_WORD","position" : 1}, {"token" : "扇","start_offset" : 3,"end_offset" : 4,"type" : "CN_WORD","position" : 2}, {"token" : "过","start_offset" : 4,"end_offset" : 5,"type" : "CN_CHAR","position" : 3}, {"token" : "背","start_offset" : 5,"end_offset" : 6,"type" : "CN_WORD","position" : 4}, {"token" : "刺","start_offset" : 6,"end_offset" : 7,"type" : "CN_CHAR","position" : 5} ]
}
2.加入自定义字典
扩展字典:用于创建分词的字典
停止词字典:用于过滤的字典,也就是说,该字典中的单词或字符串都会被过滤掉
test.dic
刀扇
背刺
teststop.dic
自古
过
/analysis-ik/config/IKAnalyzer.cfg.xml
1) ik分词器
[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728
{"tokens": [ {"token" : "刀扇","start_offset" : 2,"end_offset" : 4,"type" : "CN_WORD","position" : 0}, {"token" : "刀","start_offset" : 2,"end_offset" : 3,"type" : "CN_WORD","position" : 1}, {"token" : "扇","start_offset" : 3,"end_offset" : 4,"type" : "CN_WORD","position" : 2}, {"token" : "背刺","start_offset" : 5,"end_offset" : 7,"type" : "CN_WORD","position" : 3}, {"token" : "背","start_offset" : 5,"end_offset" : 6,"type" : "CN_WORD","position" : 4}, {"token" : "刺","start_offset" : 6,"end_offset" : 7,"type" : "CN_CHAR","position" : 5} ]
}
2) ik_smart分词器
[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 260
{"tokens": [ {"token" : "刀扇","start_offset" : 2,"end_offset" : 4,"type" : "CN_WORD","position" : 0}, {"token" : "背刺","start_offset" : 5,"end_offset" : 7,"type" : "CN_WORD","position" : 1} ]
}
3) ik_max_word分词器
[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728
{"tokens": [ {"token" : "刀扇","start_offset" : 2,"end_offset" : 4,"type" : "CN_WORD","position" : 0}, {"token" : "刀","start_offset" : 2,"end_offset" : 3,"type" : "CN_WORD","position" : 1}, {"token" : "扇","start_offset" : 3,"end_offset" : 4,"type" : "CN_WORD","position" : 2}, {"token" : "背刺","start_offset" : 5,"end_offset" : 7,"type" : "CN_WORD","position" : 3}, {"token" : "背","start_offset" : 5,"end_offset" : 6,"type" : "CN_WORD","position" : 4}, {"token" : "刺","start_offset" : 6,"end_offset" : 7,"type" : "CN_CHAR","position" : 5} ]
}