分词练习
准备数据
post metric_zabbix/log
{
"@message":"fjdlakjfdklafjda EventType=10 fdsfada",
"@timestamp":"2019-02-22T17:45:49.000+08:00"
}
查看存进去的分词结果
post metric_zabbix/_analyze
{
"field":"@message",
"text":"fjdlakjfdklafjda EventType:10 fdsfada"
}
返回结果
{
"tokens": [
{
"token": "fjdlakjfdklafjda",
"start_offset": 0,
"end_offset": 16,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "eventtype",
"start_offset": 17,
"end_offset": 26,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "10",
"start_offset": 27,
"end_offset": 29,
"type": "<NUM>",
"position": 2
},
{
"token": "fdsfada",
"start_offset": 30,
"end_offset": 37,
"type": "<ALPHANUM>",
"position": 3
}
]
}
可以看到eventtype和10被切分成了两个独立的词条:等号=、冒号:这类符号在分词时都被当作分隔符,效果其实类似于空格
再准备两条数据
post metric_zabbix/log
{
"@message":"fjdlakjfdklafjda EventType:10 fdsfada",
"@timestamp":"2019-02-22T17:45:49.000+08:00"
}
post metric_zabbix/log
{
"@message":"fjdlakjfdklafjda EventType=8",
"@timestamp":"2019-02-22T17:45:49.000+08:00"
}
这个时候索引内有三条数据,分别包括了EventType=10,EventType:10,EventType=8
首先拿一条语句查询EventType=10
post metric_zabbix/_search
{
"from" : 0,
"size" : 500,
"query" : {
"bool" : {
"must" : [
{
"query_string" : {
"query" : "EventType=10",
"fields" : [ ],
"use_dis_max" : true,
"tie_breaker" : 0.0,
"default_operator" : "or",
"auto_generate_phrase_queries" : false,
"max_determinized_states" : 10000,
"enable_position_increments" : true,
"fuzziness" : "AUTO",
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"phrase_slop" : 0,
"escape" : false,
"split_on_whitespace" : true,
"boost" : 1.0
}
}
]
}
}
}
返回结果
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1.0498221,
"hits": [
{
"_index": "metric_zabbix",
"_type": "log",
"_id": "ZojAXm0B32VSCY0zG_Zd",
"_score": 1.0498221,
"_source": {
"@message": "fjdlakjfdklafjda EventType=10 fdsfada",
"@timestamp": "2019-02-22T17:44:49.000+08:00"
}
},
{
"_index": "metric_zabbix",
"_type": "log",
"_id": "XI3aXm0B32VSCY0zOTxg",
"_score": 1.0498221,
"_source": {
"@message": "fjdlakjfdklafjda EventType:10 fdsfada",
"@timestamp": "2019-02-22T17:45:49.000+08:00"
}
},
{
"_index": "metric_zabbix",
"_type": "log",
"_id": "AIi_Xm0B32VSCY0z-PAp",
"_score": 0.35667494,
"_source": {
"@message": "fjdlakjfdklafjda EventType=8",
"@timestamp": "2019-02-22T17:44:49.000+08:00"
}
}
]
}
}
可以看到返回了三条结果,但这并不是我们想要的结果。因为查询时的分词效果跟写入时一样,都相当于把等于号=作为分词间隔,所以不论是eventtype还是10,只要其中一个能跟现有文档的分词匹配上,该文档就会被返回,因此返回了三条结果。也就是说,它跟下面这条查询的结果是一样的
post metric_zabbix/_search
{
"from" : 0,
"size" : 500,
"query" : {
"bool" : {
"must" : [
{
"query_string" : {
"query" : "EventType 10",
"fields" : [ ],
"use_dis_max" : true,
"tie_breaker" : 0.0,
"default_operator" : "or",
"auto_generate_phrase_queries" : false,
"max_determinized_states" : 10000,
"enable_position_increments" : true,
"fuzziness" : "AUTO",
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"phrase_slop" : 0,
"escape" : false,
"split_on_whitespace" : true,
"boost" : 1.0
}
}
]
}
}
}
因为存入文档时,es已经对文档进行了分词并存入倒排索引,所以想要只精确匹配到EventType=10是实现不了的。但是我们可以改变查询条件:给查询内容加上双引号,让它按短语整体匹配,这样起码不会匹配到EventType=8这条记录,只返回EventType=10、EventType:10这两条结果
post metric_zabbix/_search
{
"from" : 0,
"size" : 500,
"query" : {
"bool" : {
"must" : [
{
"query_string" : {
"query" : "\"EventType=10\"",
"fields" : [ ],
"use_dis_max" : true,
"tie_breaker" : 0.0,
"default_operator" : "or",
"auto_generate_phrase_queries" : false,
"max_determinized_states" : 10000,
"enable_position_increments" : true,
"fuzziness" : "AUTO",
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"phrase_slop" : 0,
"analyze_wildcard" : true,
"escape" : false,
"split_on_whitespace" : true,
"boost" : 1.0
}
}
]
}
}
}
返回结果
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.0498221,
"hits": [
{
"_index": "metric_zabbix",
"_type": "log",
"_id": "ZojAXm0B32VSCY0zG_Zd",
"_score": 1.0498221,
"_source": {
"@message": "fjdlakjfdklafjda EventType=10 fdsfada",
"@timestamp": "2019-02-22T17:44:49.000+08:00"
}
},
{
"_index": "metric_zabbix",
"_type": "log",
"_id": "XI3aXm0B32VSCY0zOTxg",
"_score": 1.0498221,
"_source": {
"@message": "fjdlakjfdklafjda EventType:10 fdsfada",
"@timestamp": "2019-02-22T17:45:49.000+08:00"
}
}
]
}
}
自定义分词器
针对上边的情况,如果我们只想用空格来做分词,不想用等号=分词的话,可以手动创建一个针对索引的分词器(下面的分词器虽然取名为comma,但它的切分规则pattern实际是空格)
put metric_zabbix1/
{
"analysis": {
"analyzer": {
"comma": {
"type": "pattern",
"pattern": " "
}
}
}
}
返回结果
{
"acknowledged": true,
"shards_acknowledged": true,
"index": "metric_zabbix1"
}
注意,只能对新的索引来创建分词器
创建完之后,比如我们想要在@message字段上边使用自定义的分词器comma,那么就更新mapping,使得@message字段使用comma分词器
put metric_zabbix1/_mapping/log
{
"log": {
"properties": {
"@message": {
"type": "text",
"analyzer": "comma",
"search_analyzer": "comma",
"search_quote_analyzer": "comma"
}
}
}
}
其中的log,也就是索引下的某一个类型,与你索引一条新记录时使用的type保持一致即可;
经过以上两步,往索引metric_zabbix1中写入type类型为log的新文档时,只要文档带有@message字段,该字段就会用空格分词,而不会使用等于号分词,测试一下。
post metric_zabbix1/_analyze
{
"field":"@message",
"text":"空格分词 EventType=10 fdsfada"
}
返回结果
{
"tokens": [
{
"token": "空格分词",
"start_offset": 0,
"end_offset": 4,
"type": "word",
"position": 0
},
{
"token": "eventtype=10",
"start_offset": 5,
"end_offset": 17,
"type": "word",
"position": 1
},
{
"token": "fdsfada",
"start_offset": 18,
"end_offset": 25,
"type": "word",
"position": 2
}
]
}
可以看到很明显的差别,eventtype=10没有被分词。
这个时候随便插入几条之前的测试数据(下面四个请求体需要各自通过一次post metric_zabbix1/log请求分别写入)
post metric_zabbix1/log
{
"@message":"空格分词 EventType:10 fdsfada",
"@timestamp":"2019-02-22T17:45:49.000+08:00"
}
{
"@message":"空格分词 EventType=10 fdsfada",
"@timestamp":"2019-02-22T17:45:49.000+08:00"
}
{
"@message":"空格分词 EventType=8 fdsfada",
"@timestamp":"2019-02-22T17:45:49.000+08:00"
}
{
"@message":"空格分词 EventType:8 fdsfada",
"@timestamp":"2019-02-22T17:45:49.000+08:00"
}
再进行查询
post metric_zabbix1/_search
{
"from" : 0,
"size" : 500,
"query" : {
"bool" : {
"must" : [
{
"query_string" : {
"query" : "EventType=10",
"fields" : [ ],
"use_dis_max" : true,
"tie_breaker" : 0.0,
"default_operator" : "or",
"auto_generate_phrase_queries" : false,
"max_determinized_states" : 10000,
"enable_position_increments" : true,
"fuzziness" : "AUTO",
"fuzzy_prefix_length" : 0,
"fuzzy_max_expansions" : 50,
"phrase_slop" : 0,
"escape" : false,
"split_on_whitespace" : true,
"boost" : 1.0
}
}
]
}
}
}
返回结果只有一条
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.2039728,
"hits": [
{
"_index": "metric_zabbix1",
"_type": "log",
"_id": "OpcbX20B32VSCY0zLuHk",
"_score": 1.2039728,
"_source": {
"@message": "空格分词 EventType=10 fdsfada",
"@timestamp": "2019-02-22T17:45:49.000+08:00"
}
}
]
}
}
本文学习并参考以下博客:
https://www.cnblogs.com/gmhappy/p/9472373.html
https://blog.csdn.net/u014591788/article/details/81946303
https://baijiahao.baidu.com/s?id=1609869808965712860&wfr=spider&for=pc