Lucene 评分机制二 Payload

最新推荐文章于 2019-06-19 11:13:09 发布

weixin_30817749

最新推荐文章于 2019-06-19 11:13:09 发布

阅读量178

点赞数

文章标签： java

原文链接：http://www.cnblogs.com/shm10/p/3628551.html

版权

本文使用的是 Lucene 4.7.0,其 API 与 Lucene 3.x 稍有不同。

有下面三段内容,我想对与“船”相关的一系列词(boat、ship、vessel)的搜索结果进行加分:

　　bike car jeep truck bus boat

　　train car ship boat van subway

　　car plane taxi boat vessel railway

自定义MyAnalyzer,用于在分词时为字段中的词条设置有效载荷(Payload)

 1 package com.pera.lucene.score.payload;
 2 
 3 import java.io.Reader;
 4 
 5 import org.apache.lucene.analysis.Analyzer;
 6 import org.apache.lucene.analysis.Tokenizer;
 7 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 8 import org.apache.lucene.analysis.payloads.PayloadEncoder;
 9 import org.apache.lucene.util.Version;
10 
11 public class MyAnalyzer extends Analyzer
12 {
13 
14     private PayloadEncoder encoder;
15 
16     MyAnalyzer(PayloadEncoder encoder)
17     {
18         this.encoder = encoder;
19     }
20 
21     @Override
22     protected TokenStreamComponents createComponents(String fieldName, Reader reader)
23     {
24         // 用来解析空格分隔的各个类别
25         Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader); 
26         // 自定义的Filter,用来获取字段的Payload值
27         MyTokenFilter filter = new MyTokenFilter(source, encoder);
28 
29         return new TokenStreamComponents(source, filter);
30     }
31 
32 }

View Code

自定义TokenFilter,根据词条内容为其设置相应的Payload值(每个词条的分值取自下面的scoreMap)

 1 package com.pera.lucene.score.payload;
 2 
 3 import java.io.IOException;
 4 
 5 import org.apache.lucene.analysis.TokenFilter;
 6 import org.apache.lucene.analysis.TokenStream;
 7 import org.apache.lucene.analysis.payloads.PayloadEncoder;
 8 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 9 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
10 
11 public class MyTokenFilter extends TokenFilter
12 {
13     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
14     private final PayloadAttribute payAtt = addAttribute(PayloadAttribute.class);
15     private final PayloadEncoder encoder;
16 
17     public MyTokenFilter(TokenStream input, PayloadEncoder encoder)
18     {
19         super(input);
20         this.encoder = encoder;
21     }
22 
23     @Override
24     public boolean incrementToken() throws IOException
25     {
26         if (input.incrementToken())
27         {
28             String term = termAtt.toString();
29             if (App.scoreMap.containsKey(term))
30             {
31                 payAtt.setPayload(encoder.encode(App.scoreMap.get(term).toCharArray()));
32             } else
33             {
34                 payAtt.setPayload(null);
35             }
36             return true;
37         } else
38             return false;
39     }
40 
41 }

View Code

1     public static ImmutableMap<String, String> scoreMap = ImmutableMap.of("boat", "5f", "ship", "20f", "vessel", "100f");

View Code

自定义PayloadSimilarity,继承DefaultSimilarity并重写(override)scorePayload方法,在检索时取得建立索引时设置的Payload值

 1 package com.pera.lucene.score.payload;
 2 
 3 import org.apache.lucene.analysis.payloads.PayloadHelper;
 4 import org.apache.lucene.search.similarities.DefaultSimilarity;
 5 import org.apache.lucene.util.BytesRef;
 6 
 7 public class PayloadSimilarity extends DefaultSimilarity
 8 {
 9     @Override
10     public float scorePayload(int doc, int start, int end, BytesRef payload)
11     {
12         return PayloadHelper.decodeFloat(payload.bytes);
13     }
14 }

View Code

建立索引需要将之前定义的Analyzer和PayloadSimilarity设置到Config中

 1 package com.pera.lucene.score.payload;
 2 
 3 import java.io.File;
 4 import java.io.IOException;
 5 import java.util.Date;
 6 
 7 import org.apache.lucene.analysis.Analyzer;
 8 import org.apache.lucene.analysis.payloads.FloatEncoder;
 9 import org.apache.lucene.document.Document;
10 import org.apache.lucene.document.Field.Store;
11 import org.apache.lucene.document.TextField;
12 import org.apache.lucene.index.IndexWriter;
13 import org.apache.lucene.index.IndexWriterConfig;
14 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
15 import org.apache.lucene.search.similarities.Similarity;
16 import org.apache.lucene.store.Directory;
17 import org.apache.lucene.store.FSDirectory;
18 import org.apache.lucene.util.Version;
19 
20 public class Indexing
21 {
22     public void indexPayload() throws IOException
23     {
24         Directory dir = FSDirectory.open(new File(App.indexPath));
25         Analyzer analyzer = new MyAnalyzer(new FloatEncoder());
26         Similarity similarity = new PayloadSimilarity();
27 
28         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
29         iwc.setOpenMode(OpenMode.CREATE).setSimilarity(similarity);
30         Date start = new Date();
31         System.out.println("Indexing to directory '" + App.indexPath + "'...");
32         IndexWriter writer = new IndexWriter(dir, iwc);
33         Document doc = new Document();
34         doc.add(new TextField("tools", "bike car jeep truck bus boat", Store.YES));
35         writer.addDocument(doc);
36 
37         doc = new Document();
38         doc.add(new TextField("tools", "train car ship boat van subway", Store.YES));
39         writer.addDocument(doc);
40 
41         doc = new Document();
42         doc.add(new TextField("tools", "car plane taxi boat vessel railway", Store.YES));
43         writer.addDocument(doc);
44 
45         writer.close();
46 
47         Date end = new Date();
48         System.out.println(end.getTime() - start.getTime() + " total milliseconds");
49     }
50 }

View Code

进行检索:检索时要将PayloadSimilarity设置到searcher中

 1 package com.pera.lucene.score.payload;
 2 
 3 import java.io.File;
 4 import java.io.IOException;
 5 
 6 import org.apache.lucene.index.DirectoryReader;
 7 import org.apache.lucene.index.IndexReader;
 8 import org.apache.lucene.index.Term;
 9 import org.apache.lucene.queryparser.classic.ParseException;
10 import org.apache.lucene.search.BooleanClause.Occur;
11 import org.apache.lucene.search.BooleanQuery;
12 import org.apache.lucene.search.Explanation;
13 import org.apache.lucene.search.IndexSearcher;
14 import org.apache.lucene.search.ScoreDoc;
15 import org.apache.lucene.search.TopDocs;
16 import org.apache.lucene.search.payloads.AveragePayloadFunction;
17 import org.apache.lucene.search.payloads.PayloadTermQuery;
18 import org.apache.lucene.store.FSDirectory;
19 
20 public class Searching
21 {
22 
23     public void searchPayload() throws IOException, ParseException
24     {
25         IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(App.indexPath)));
26         IndexSearcher searcher = new IndexSearcher(reader);
27 
28         BooleanQuery bq = new BooleanQuery();
29 
30         PayloadTermQuery ptq1 = new PayloadTermQuery(new Term("tools", "ship"), new AveragePayloadFunction());
31         PayloadTermQuery ptq2 = new PayloadTermQuery(new Term("tools", "boat"), new AveragePayloadFunction());
32         PayloadTermQuery ptq3 = new PayloadTermQuery(new Term("tools", "vessel"), new AveragePayloadFunction());
33         
34         bq.add(ptq1, Occur.SHOULD);
35         bq.add(ptq2, Occur.SHOULD);
36         bq.add(ptq3, Occur.SHOULD);
37 
38         // 设置自定义的PayloadSimilarity
39         searcher.setSimilarity(new PayloadSimilarity()); 
40         TopDocs results = searcher.search(bq, 10);
41         ScoreDoc[] hits = results.scoreDocs;
42 
43         int numTotalHits = results.totalHits;
44         System.out.println(numTotalHits + " total matching documents");
45 
46         for (int i = 0; i < hits.length; i++)
47         {
48             int docId = hits[i].doc; // 文档编号
49             float lucene_score = hits[i].score;
50             String tools = searcher.doc(docId).get("tools");
51             System.out.println("DocId:" + docId + "\tLucene Score:" + lucene_score + "\tTools:" + tools);
52             Explanation explanation = searcher.explain(bq, docId);
53             System.out.println(explanation.toString());
54         }
55     }
56 }

View Code

从检索结果可以看到,DocId为2的文档由于vessel带有较高的Payload值(100),排名得到了提升

3 total matching documents
DocId:2    Lucene Score:16.750757    Tools:car plane taxi boat vessel railway
16.750757 = (MATCH) product of:
  25.126135 = (MATCH) sum of:
    0.3186112 = (MATCH) btq, product of:
      0.06372224 = weight(tools:boat in 2) [PayloadSimilarity], result of:
        0.06372224 = score(doc=2,freq=0.5 = phraseFreq=0.5
), product of:
          0.33736566 = queryWeight, product of:
            0.71231794 = idf(docFreq=3, maxDocs=3)
            0.4736167 = queryNorm
          0.18888181 = fieldWeight in 2, product of:
            0.70710677 = tf(freq=0.5), with freq of:
              0.5 = phraseFreq=0.5
            0.71231794 = idf(docFreq=3, maxDocs=3)
            0.375 = fieldNorm(doc=2)
      5.0 = AveragePayloadFunction.docScore()
    24.807524 = (MATCH) btq, product of:
      0.24807523 = weight(tools:vessel in 2) [PayloadSimilarity], result of:
        0.24807523 = score(doc=2,freq=0.5 = phraseFreq=0.5
), product of:
          0.66565174 = queryWeight, product of:
            1.4054651 = idf(docFreq=1, maxDocs=3)
            0.4736167 = queryNorm
          0.37268022 = fieldWeight in 2, product of:
            0.70710677 = tf(freq=0.5), with freq of:
              0.5 = phraseFreq=0.5
            1.4054651 = idf(docFreq=1, maxDocs=3)
            0.375 = fieldNorm(doc=2)
      100.0 = AveragePayloadFunction.docScore()
  0.6666667 = coord(2/3)

DocId:1    Lucene Score:3.5200772    Tools:train car ship boat van subway
3.5200772 = (MATCH) product of:
  5.2801156 = (MATCH) sum of:
    4.9615045 = (MATCH) btq, product of:
      0.24807523 = weight(tools:ship in 1) [PayloadSimilarity], result of:
        0.24807523 = score(doc=1,freq=0.5 = phraseFreq=0.5
), product of:
          0.66565174 = queryWeight, product of:
            1.4054651 = idf(docFreq=1, maxDocs=3)
            0.4736167 = queryNorm
          0.37268022 = fieldWeight in 1, product of:
            0.70710677 = tf(freq=0.5), with freq of:
              0.5 = phraseFreq=0.5
            1.4054651 = idf(docFreq=1, maxDocs=3)
            0.375 = fieldNorm(doc=1)
      20.0 = AveragePayloadFunction.docScore()
    0.3186112 = (MATCH) btq, product of:
      0.06372224 = weight(tools:boat in 1) [PayloadSimilarity], result of:
        0.06372224 = score(doc=1,freq=0.5 = phraseFreq=0.5
), product of:
          0.33736566 = queryWeight, product of:
            0.71231794 = idf(docFreq=3, maxDocs=3)
            0.4736167 = queryNorm
          0.18888181 = fieldWeight in 1, product of:
            0.70710677 = tf(freq=0.5), with freq of:
              0.5 = phraseFreq=0.5
            0.71231794 = idf(docFreq=3, maxDocs=3)
            0.375 = fieldNorm(doc=1)
      5.0 = AveragePayloadFunction.docScore()
  0.6666667 = coord(2/3)

DocId:0    Lucene Score:0.106203735    Tools:bike car jeep truck bus boat
0.106203735 = (MATCH) product of:
  0.3186112 = (MATCH) sum of:
    0.3186112 = (MATCH) btq, product of:
      0.06372224 = weight(tools:boat in 0) [PayloadSimilarity], result of:
        0.06372224 = score(doc=0,freq=0.5 = phraseFreq=0.5
), product of:
          0.33736566 = queryWeight, product of:
            0.71231794 = idf(docFreq=3, maxDocs=3)
            0.4736167 = queryNorm
          0.18888181 = fieldWeight in 0, product of:
            0.70710677 = tf(freq=0.5), with freq of:
              0.5 = phraseFreq=0.5
            0.71231794 = idf(docFreq=3, maxDocs=3)
            0.375 = fieldNorm(doc=0)
      5.0 = AveragePayloadFunction.docScore()
  0.33333334 = coord(1/3)

转载于:https://www.cnblogs.com/shm10/p/3628551.html

weixin_30817749

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Lucene 评分机制二 Payload

这里使用的Lucene4.7.0和Lucene3.X稍有不同有下面三段内容,我想对船一系列的搜索进行加分　　bike car jeep truck bus boat　　train car ship boat van subway　　car plane taxi boat vessel railway定义自定义的MyAnalyzer,实现对字段的有效载荷进行赋值...
复制链接

扫一扫