lucene的多种搜索2-SpanQuery

SpanQuery是按照词在文章中的位置(词与词之间的距离)进行的查询,可用于查询几个相邻或相距一定距离的词。

SpanQuery包括以下几种:

SpanTermQuery:词距查询的基础,结果和TermQuery相似,只不过是增加了查询结果中单词的距离信息。

SpanFirstQuery:从字段内容的起始位置算起,在指定距离之内查找指定词的查询。

SpanNearQuery:查询的几个词(或子查询)之间保持着一定的距离。

SpanOrQuery:同时执行几个词距查询,并把它们的结果合并起来。

SpanNotQuery:从一个词距查询结果中,去除一个词距查询。

下面用一个简单的例子来介绍:

Java代码
package com;   
  
//SpanQuery:跨度查询。此类为抽象类。   
  
import java.io.IOException;   
import java.io.StringReader;   
import java.util.ArrayList;   
import java.util.List;   
import org.apache.lucene.analysis.Analyzer;   
import org.apache.lucene.analysis.Token;   
import org.apache.lucene.analysis.TokenStream;   
import org.apache.lucene.analysis.WhitespaceAnalyzer;   
import org.apache.lucene.document.Document;   
import org.apache.lucene.document.Field;   
import org.apache.lucene.document.Field.Index;   
import org.apache.lucene.document.Field.Store;   
import org.apache.lucene.index.IndexReader;   
import org.apache.lucene.index.IndexWriter;   
import org.apache.lucene.index.Term;   
import org.apache.lucene.search.Hits;   
import org.apache.lucene.search.IndexSearcher;   
import org.apache.lucene.search.spans.SpanFirstQuery;   
import org.apache.lucene.search.spans.SpanNearQuery;   
import org.apache.lucene.search.spans.SpanNotQuery;   
import org.apache.lucene.search.spans.SpanOrQuery;   
import org.apache.lucene.search.spans.SpanQuery;   
import org.apache.lucene.search.spans.SpanTermQuery;   
import org.apache.lucene.search.spans.Spans;   
import org.apache.lucene.store.RAMDirectory;   
  
public class SpanQueryTest {   
  
    private RAMDirectory directory;   
  
    private IndexSearcher indexSearcher;   
  
    private IndexReader reader;   
  
    private SpanTermQuery quick;   
  
    private SpanTermQuery brown;   
  
    private SpanTermQuery red;   
  
    private SpanTermQuery fox;   
  
    private SpanTermQuery lazy;   
  
    private SpanTermQuery sleepy;   
  
    private SpanTermQuery dog;   
  
    private SpanTermQuery cat;   
  
    private Analyzer analyzer;   
       
    // 索引及初使化   
    public void index() throws IOException {   
  
        directory = new RAMDirectory();   
  
        analyzer = new WhitespaceAnalyzer();   
  
        IndexWriter writer = new IndexWriter(directory, analyzer, true);   
  
        Document doc1 = new Document();   
  
        doc1.add(new Field("field",   
                "the quick brown fox jumps over the lazy dog", Store.YES,   
                Index.TOKENIZED));   
  
        Document doc2 = new Document();   
  
        doc2.add(new Field("field",   
                "the quick red fox jumps over the sleepy cat", Store.YES,   
                Index.TOKENIZED));   
  
        writer.addDocument(doc1);   
  
        writer.addDocument(doc2);   
  
        writer.optimize();   
  
        writer.close();   
  
        quick = new SpanTermQuery(new Term("field", "quick"));   
  
        brown = new SpanTermQuery(new Term("field", "brown"));   
  
        red = new SpanTermQuery(new Term("field", "red"));   
  
        fox = new SpanTermQuery(new Term("field", "fox"));   
        lazy = new SpanTermQuery(new Term("field", "lazy"));   
        sleepy = new SpanTermQuery(new Term("field", "sleepy"));   
        dog = new SpanTermQuery(new Term("field", "dog"));   
        cat = new SpanTermQuery(new Term("field", "cat"));   
  
        indexSearcher = new IndexSearcher(directory);   
  
        reader = IndexReader.open(directory);   
    }   
  
    private void dumpSpans(SpanQuery query) throws IOException {   
  
        // 检索效果和TermQuery一样,可以把他当成TermQuery   
        Hits hits = indexSearcher.search(query);   
        for (int i = 0; i < hits.length(); i++) {   
            // System.out.println(hits.doc(i).get("field"));   
        }   
  
        // 但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。   
  
        Spans spans = query.getSpans(reader);   
  
        int numSpans = 0;   
  
        float[] scores = new float[2];   
        for (int i = 0; i < hits.length(); i++) {   
            scores[hits.id(i)] = hits.score(i);   
        }   
  
        while (spans.next()) {   
  
            numSpans++;   
  
            int id = spans.doc();   
  
            Document doc = reader.document(id);   
  
            Token[] tokens = AnalyzerUtils.tokensFromAnalysis(analyzer, doc   
                    .get("field"));   
  
            StringBuffer buffer = new StringBuffer();   
  
            for (int i = 0; i < tokens.length; i++) {   
                // the quick brown fox jumps over the lazy dog   
                // spans记录了位置信息,比如搜索brown,brown在这句话中位于第三个位置,所以spans.start()=2,spans.end()=3   
                // 在第二项的位置后加<,第三项后加> 返回<brown>   
                if (i == spans.start()) {   
                    buffer.append("<");   
                }   
                buffer.append(tokens[i].termText());   
                if (i + 1 == spans.end()) {   
                    buffer.append(">");   
                }   
                buffer.append(" ");   
            }   
            buffer.append("(" + scores[id] + ") ");   
  
            System.out.println(buffer);   
        }   
  
        // indexSearcher.close();   
    }   
  
    // SpanTermQuery:检索效果完全同TermQuery,但内部会记录一些位置信息,供SpanQuery的其它API使用,是其它属于SpanQuery的Query的基础。   
    public void spanTermQueryTest() throws IOException {   
        dumpSpans(brown);   
           
        搜索结果   
        // the quick <brown> fox jumps over the lazy dog (0.22097087)    
    }   
  
    // SpanFirstQuery:查找方式为从Field的内容起始位置开始,在一个固定的宽度内查找所指定的词条。   
    public void spanFirstQueryTest() throws IOException {   
        // the quick brown fox jumps over the lazy dog   
        // 在给定的范围搜索,前两个为the quick   
        // brown 在doc1的第三个位置,用SpanFirstQuery从起点查找的话,他的跨度必须为>=3才能找到   
        SpanFirstQuery firstQuery = new SpanFirstQuery(brown, 3);   
        dumpSpans(firstQuery);   
           
        搜索结果   
        // the quick <brown> fox jumps over the lazy dog (0.22097087)    
    }   
  
    // SpanNearQuery:功能类似PharaseQuery。SpanNearQuery查找所匹配的不一定是短语,还有可能是另一个SpanQuery的查询结果作为整体考虑,进行嵌套查询。   
    public void spanNearQueryTest() throws IOException {   
        // the quick brown fox jumps over the lazy dog   
  
        // 第二个参数为两个项的位置之间允许的最大间隔   
        // 在这里两个较远的项为quick和fox,他们之是的最大间隔为5,所以slop必须>=5才能搜到结果   
        SpanNearQuery nearQuery = new SpanNearQuery(new SpanQuery[] { quick,   
                brown, fox }, 5, true);   
  
        dumpSpans(nearQuery);   
  
        // 与PhraseQuery短语搜索相似   
        // 这里搜索quick,dog,brown,要想得到结果,就要将brown向后移动5个位置才能到dog的后面,所以slop要>=5才能找到结果   
        // 第三个参数,如果为true表示保持各项位置不变,顺序搜索   
        nearQuery = new SpanNearQuery(new SpanQuery[] { quick, dog, brown }, 5,   
                false);   
  
        dumpSpans(nearQuery);   
           
        //搜索结果/   
        // 第一个dumpSpans的结果 the <quick brown fox> jumps over the lazy dog (0.34204215)    
        // 第二个dumpSpans的结果 the <quick brown fox jumps over the lazy dog> (0.27026406)    
    }   
  
    // 从第一个SpanQuery查询结果中,去掉第二个SpanQuery查询结果,作为检索结果   
    public void spanNotQueryTest() throws IOException {   
           
        // the quick brown fox jumps over the lazy dog   
  
        SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick,   
                fox }, 1, true);   
  
        // 结果为quick brown fox 和 quick red fox   
        dumpSpans(quick_fox);   
  
        // SpanNotQuery quick_fox_dog = new SpanNotQuery(quick_fox, dog);   
        //   
        // dumpSpans(quick_fox_dog);   
  
        // 在quick_fox结果中,去掉red,结果为quick brown fox   
        SpanNotQuery no_quick_red_fox = new SpanNotQuery(quick_fox, red);   
  
        dumpSpans(no_quick_red_fox);   
           
        //搜索结果///第一个dumpSpans结果为前两条,第二个dumpSpans结果为第三条   
        //the <quick brown fox> jumps over the lazy dog (0.18579213)    
        //the <quick red fox> jumps over the sleepy cat (0.18579213)    
        //the <quick brown fox> jumps over the lazy dog (0.18579213)    
    }   
  
    // SpanOrQuery:把所有SpanQuery查询结果综合起来,作为检索结果。   
    public void spanOrQueryTest() throws IOException   {   
           
        SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick,   
                fox }, 1, true);   
           
        SpanNearQuery lazy_dog = new SpanNearQuery(   
                new SpanQuery[] { lazy, dog }, 0, true);   
  
        SpanNearQuery sleepy_cat = new SpanNearQuery(new SpanQuery[] { sleepy,   
                cat }, 0, true);   
  
        SpanNearQuery qf_near_ld = new SpanNearQuery(new SpanQuery[] {   
                quick_fox, lazy_dog }, 3, true);   
           
        dumpSpans(qf_near_ld);   
  
        SpanNearQuery qf_near_sc = new SpanNearQuery(new SpanQuery[] {   
                quick_fox, sleepy_cat }, 3, true);   
           
        dumpSpans(qf_near_sc);   
  
        SpanOrQuery or = new SpanOrQuery(new SpanQuery[] { qf_near_ld,   
                qf_near_sc });   
           
        dumpSpans(or);   
           
        /搜索结果 第一个dumpSpans结果为第一条,第二个为第二条,第三个为第三,四条   
        // the <quick brown fox jumps over the lazy dog> (0.3321948)    
        // the <quick red fox jumps over the sleepy cat> (0.3321948)    
        // the <quick brown fox jumps over the lazy dog> (0.5405281)    
        // the <quick red fox jumps over the sleepy cat> (0.5405281)    
    }   
  
    public static void main(String[] args) throws IOException {   
  
        SpanQueryTest test = new SpanQueryTest();   
  
        test.index();   
  
        test.spanOrQueryTest();   
    }   
}   
  
class AnalyzerUtils {   
    public static Token[] tokensFromAnalysis(Analyzer analyzer, String text)   
            throws IOException {   
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(   
                text));   
        boolean b = true;   
        List<Token> list = new ArrayList<Token>();   
        while (b) {   
            Token token = stream.next();   
            if (token == null)   
                b = false;   
            else  
                list.add(token);   
        }   
        return (Token[]) list.toArray(new Token[0]);   
    }   
}  


本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/cool_rain_man/archive/2008/04/03/2247604.aspx

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值