相关性和相关性算法
相关性
相关性算分描述了一个文档和查询语句匹配的程度,es会对每个匹配查询条件的结果进行算分
打分的本质是排序,将相关性高的文档排在最前面。ES现在采用的是BM25的算分机制
词频-term frequency-TF
检索词在一篇文档中出现的频率
算法:检索词出现的次数除以文档的总词数
度量一条查询和结果文档相关性的简单方法:简单地将搜索中每一个词的TF进行相加
TF(区块链)+TF(的)+TF(应用)
stop word
将不必要的词(停用词)过滤掉,不参与算分
比如TF(的)是没有必要算分的
逆文档频率 IDF
DF:检索词在所有文档中出现的频率
inverse document frequency:简单地说就是:log(全部文档数/检索词出现过的文档总数)
TF-IDF 本质上就是将TF求和变成加权求和
TF(区块链)*IDF(区块链)+TF(的)*IDF(的)+TF(应用)*IDF(应用)
POST testscore/_bulk {"index":{"_id":1}} {"content":"we use Elasticsearch to power the search"} {"index":{"_id":2}} {"content":"we like elasticsearch"} {"index":{"_id":3}} {"content":"the scoring of documents is caculated by the scoring formula"} {"index":{"_id":4}} {"content":"you know for search"} GET testscore/_search { "query": { "match": { "content": "elasticsearch" } } } 结果: { "took" : 2, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 2, "relation" : "eq" }, "max_score" : 0.8713851, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "2", "_score" : 0.8713851,//由于文档比较短,所以打分比较高 "_source" : { "content" : "we like elasticsearch" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "1", "_score" : 0.6489038, "_source" : { "content" : "we use Elasticsearch to power the search" } } ] } } GET testscore/_search { "query": { "match": { "content": "you" } } } 结果: { "took" : 3, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 1, "relation" : "eq" }, "max_score" : 1.3940738, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "4", "_score" : 1.3940738, "_source" : { "content" : "you know for search" } } ] } } GET testscore/_search { "query": { "match": { "content": "the" } } } 结果: { "took" : 3, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 2, "relation" : "eq" }, "max_score" : 0.8025915, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "3", "_score" : 0.8025915, "_source" : { "content" : "the scoring of documents is caculated by the scoring formula" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "1", "_score" : 0.6489038, "_source" : { "content" : "we use Elasticsearch to power the search" } } ] } } GET testscore/_search { "query": { "match": { "content": "the elasticsearch" } } 
} 结果: { "took" : 8, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 3, "relation" : "eq" }, "max_score" : 1.2978076, "hits" : [ { "_index" : "testscore", "_type" : "_doc", "_id" : "1", "_score" : 1.2978076, "_source" : { "content" : "we use Elasticsearch to power the search" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "2", "_score" : 0.8713851, "_source" : { "content" : "we like elasticsearch" } }, { "_index" : "testscore", "_type" : "_doc", "_id" : "3", "_score" : 0.8025915, "_source" : { "content" : "the scoring of documents is caculated by the scoring formula" } } ] } }
Boost Relevance---权重
Boosting是控制相关度的一种手段
boost的定义
当boost>1 打分的相关度相对提升
当0<boost<1 打分的权重相对降低
当boost<0 贡献负分(注意:Elasticsearch 7.0 起不再允许负的 boost 值,如需降低得分请使用 0 到 1 之间的 boost,或使用 boosting 查询的 negative_boost)
例:
POST /blogs/_bulk {"index":{"_id":1}} {"title":"Apple iPad","content":"Apple iPad,Apple iPad"} {"index":{"_id":2}} {"title":"Apple iPad,Apple iPad","content":"Apple iPad"} 给字段进行权重处理 GET blogs/_search { "query": { "bool": { "should": [ { "match": { "title": { "query": "Apple iPad", "boost": 1.1 } } }, { "match": { "content": { "query": "Apple iPad", "boost": 1 } } } ] } } } 结果: { "took" : 2, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 2, "relation" : "eq" }, "max_score" : 0.5353899, "hits" : [ { "_index" : "blogs", "_type" : "_doc", "_id" : "2", "_score" : 0.5353899, "_source" : { "title" : "Apple iPad,Apple iPad", "content" : "Apple iPad" } }, { "_index" : "blogs", "_type" : "_doc", "_id" : "1", "_score" : 0.5332985, "_source" : { "title" : "Apple iPad", "content" : "Apple iPad,Apple iPad" } } ] } } GET blogs/_search { "query": { "bool": { "should": [ { "match": { "title": { "query": "Apple iPad", "boost": 1.1 } } }, { "match": { "content": { "query": "Apple iPad", "boost": 2 } } } ] } } } 结果: { "took" : 4, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 2, "relation" : "eq" }, "max_score" : 0.798205, "hits" : [ { "_index" : "blogs", "_type" : "_doc", "_id" : "1", "_score" : 0.798205, "_source" : { "title" : "Apple iPad", "content" : "Apple iPad,Apple iPad" } }, { "_index" : "blogs", "_type" : "_doc", "_id" : "2", "_score" : 0.77938265, "_source" : { "title" : "Apple iPad,Apple iPad", "content" : "Apple iPad" } } ] } } 例2: POST /news/_bulk {"index":{"_id":1}} {"content":"Apple Mac"} {"index":{"_id":2}} {"content":"Apple iPad"} {"index":{"_id":3}} {"content":"Apple employee like Apple Pie and Apple Juice"} GET news/_search { "query": { "match": { "content": "Apple" } } } 结果: { "took" : 1, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { 
"total" : { "value" : 3, "relation" : "eq" }, "max_score" : 0.17280532, "hits" : [ { "_index" : "news", "_type" : "_doc", "_id" : "3", "_score" : 0.17280532, "_source" : { "content" : "Apple employee like Apple Pie and Apple Juice"//由于Apple出现的频率比较高,所以分数比较高 } }, { "_index" : "news", "_type" : "_doc", "_id" : "1", "_score" : 0.16786805, "_source" : { "content" : "Apple Mac" } }, { "_index" : "news", "_type" : "_doc", "_id" : "2", "_score" : 0.16786805, "_source" : { "content" : "Apple iPad" } } ] } } 如果我们想调整权重,让苹果的产品排在最上面,该如何做呢?可以使用 boosting 查询: GET news/_search { "query": { "boosting": { "positive": { "match": { "content": "Apple" } }, "negative": { "match": { "content": "Pie" } }, "negative_boost": 0.2 } } } 结果:苹果的产品排在最上面;包含Pie的文档(_id 3)仍然会被返回,但其得分被乘以negative_boost(0.17280532 × 0.2 = 0.034561064),因此排到了最后 { "took" : 2, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 3, "relation" : "eq" }, "max_score" : 0.16786805, "hits" : [ { "_index" : "news", "_type" : "_doc", "_id" : "1", "_score" : 0.16786805, "_source" : { "content" : "Apple Mac" } }, { "_index" : "news", "_type" : "_doc", "_id" : "2", "_score" : 0.16786805, "_source" : { "content" : "Apple iPad" } }, { "_index" : "news", "_type" : "_doc", "_id" : "3", "_score" : 0.034561064, "_source" : { "content" : "Apple employee like Apple Pie and Apple Juice" } } ] } }