关闭

bobo-browse为Lucene添加分组统计

390人阅读 评论(0) 收藏 举报

  Bobo-browse是一个基于Lucene的搜索结果分组统计(分面统计,facet)开源插件,可以完成对搜索结果的分组统计,比如在淘宝上搜索“衬衣”,会在搜索结果顶部显示“长袖衬衫(10321) 短袖衬衫(32561)”等。
Lucene 从 3.2.0 起也提供了 Grouping 组件来实现分组统计功能,但这里不做讨论,另文再述。


      据Bobo-browse项目介绍, Linkedin.com使用了该组件。

      Bobo-browse仅关注搜索,与索引的创建无关;索引仍然使用标准的Lucene索引创建方法来创建。
      Bobo-browse项目地址为 http://sna-projects.com/bobo/

以下是测试代码使用的组件及版本:
Bobo-browse: 2.5.0-rc1
Lucene: 3.4.0
log4j: 1.2.16
Fastutil: 6.4

 

测试代码:

复制代码
import java.util.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.search.*;
import org.apache.lucene.queryParser.*;

import com.browseengine.bobo.api.BoboBrowser;
import com.browseengine.bobo.api.BoboIndexReader;
import com.browseengine.bobo.api.Browsable;
import com.browseengine.bobo.api.BrowseFacet;
import com.browseengine.bobo.api.BrowseHit;
import com.browseengine.bobo.api.BrowseRequest;
import com.browseengine.bobo.api.BrowseResult;
import com.browseengine.bobo.api.FacetAccessible;
import com.browseengine.bobo.api.FacetSpec;
import com.browseengine.bobo.api.FacetSpec.FacetSortSpec;
import com.browseengine.bobo.facets.FacetHandler;
import com.browseengine.bobo.facets.impl.*;
import com.browseengine.bobo.facets.impl.SimpleFacetHandler;
import com.browseengine.bobo.facets.data.PredefinedTermListFactory;

/**
 * Minimal end-to-end demo of facet (grouped) counting with Bobo-browse on top
 * of a Lucene index.
 *
 * <p>The demo builds a small in-memory index of PC offers, runs a keyword
 * query against the description field and prints, for the matching documents,
 * the hit count per vendor, per CPU maker and per price range.
 *
 * <p>All Lucene/Bobo resources are released in {@code finally} blocks so that
 * a failure half-way through does not leak the writer, the readers or the
 * directory.
 */
public class TestBoboBrowse {

    /** Indexed field holding the vendor name (not analyzed, used as a facet). */
    private static final String FIELD_VENDOR = "vendor";
    /** Indexed field holding the CPU maker (not analyzed, used as a facet). */
    private static final String FIELD_CPU = "cpu";
    /** Indexed field holding the free-text description (analyzed, searched). */
    private static final String FIELD_DESC = "desc";
    /** Indexed field holding the zero-padded price (not analyzed, range facet). */
    private static final String FIELD_PRICE = "price";

    /** Sample rows; columns are vendor, cpu, description, zero-padded price. */
    private static final String[][] SAMPLE_DATA = new String[][]{
        new String[] {"lenovo", "Intel", "PC Core2 E8200 2GB DDR2 250GB 7200RPM 22LCD", "08998"},
        new String[] {"lenovo", "Intel", "PC Core2 E8300 2GB DDR2 320GB 7200RPM 22LCD", "09998"},
        new String[] {"lenovo", "Intel", "PC Core2 Q6600 2GB DDR2 320GB 7200RPM 22LCD", "11998"},
        new String[] {"lenovo", "Intel", "PC Core2 QX9770 4GB DDR2 320GB 7200RPM RAID-1 22LCD", "19998"},
        new String[] {"lenovo", "Intel", "PC pentium E2200 1GB DDR2 160GB 5400RPM 19LCD", "05998"},
        new String[] {"hp", "Intel", "PC pentium E2180 1GB DDR2 160GB 7200RPM 20LCD", "06398"},
        new String[] {"hp", "Intel", "PC Core2 E8200 2GB DDR2 250GB 5400RPM 22LCD", "08998"},
        new String[] {"hp", "Intel", "PC Core2 E6550 2GB DDR2 250GB 7200RPM 20LCD", "07398"},
        new String[] {"hp", "Intel", "PC Core2 QX6850 4GB DDR2 320GB 5400RPM 22LCD", "13998"},
        new String[] {"asus", "AMD", "PC Core2 QX9650 4GB DDR2 450GB 7200RPM 22LCD", "17998"},
        new String[] {"dell", "AMD", "PC Core2 athlon FX76 4GB DDR2 450GB 7200RPM 22LCD", "12998"}
    };

    /** Directory backing the demo index; created by {@link #test1()}. */
    private Directory indexDir = null;
    private final Version luceneVersion = Version.LUCENE_34;

    /**
     * Command-line entry point; runs the demo once.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        TestBoboBrowse app = new TestBoboBrowse();
        app.test1();
    }

    /**
     * Runs the whole demo: creates an in-memory directory, indexes the sample
     * data, executes the faceted search and closes the directory again.
     * Failures are reported on standard error via their stack trace.
     */
    public void test1() {
        try {
            indexDir = new RAMDirectory();
            try {
                createIndex();
                searchTest();
            } finally {
                // Close the directory even when indexing or searching fails.
                indexDir.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Writes the sample rows into {@link #indexDir} using the plain Lucene
     * indexing API (Bobo-browse is not involved at index time).
     * Failures are reported on standard error via their stack trace.
     */
    public void createIndex() {
        try {
            Analyzer analyzer = new StandardAnalyzer(luceneVersion);

            // IndexWriterConfig is the Lucene >= 3.2.0 way of configuring a writer.
            // For older versions use instead:
            //   new IndexWriter(indexDir, analyzer, true, MaxFieldLength.LIMITED)
            IndexWriterConfig indexConfig = new IndexWriterConfig(luceneVersion, analyzer);
            indexConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

            IndexWriter indexWriter = new IndexWriter(indexDir, indexConfig);
            try {
                for (String[] row : SAMPLE_DATA) {
                    Document doc = new Document();
                    doc.add(new Field(FIELD_VENDOR, row[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.add(new Field(FIELD_CPU, row[1], Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.add(new Field(FIELD_DESC, row[2], Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field(FIELD_PRICE, row[3], Field.Store.YES, Field.Index.NOT_ANALYZED));
                    indexWriter.addDocument(doc);
                }

                indexWriter.optimize();
                indexWriter.commit();
            } finally {
                // Always release the writer, even if adding a document failed.
                indexWriter.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Searches the description field for "Core2" and prints the total hit
     * count followed by the vendor, CPU and price-range facet counts.
     * Failures are reported on standard error via their stack trace.
     */
    private void searchTest() {
        try {
            String keywords = "Core2";
            QueryParser queryParser =
                    new QueryParser(luceneVersion, FIELD_DESC, new StandardAnalyzer(luceneVersion));
            Query query = queryParser.parse(keywords);

            IndexReader indexReader = IndexReader.open(indexDir, true);
            try {
                List<FacetHandler<?>> facetHandlers = new ArrayList<FacetHandler<?>>();
                facetHandlers.add(new SimpleFacetHandler(FIELD_VENDOR));
                facetHandlers.add(new SimpleFacetHandler(FIELD_CPU));

                // Explicit, closed price buckets. An open-ended alternative would be
                //   new RangeFacetHandler("price",
                //       Arrays.asList("[* TO 09998]", "[09999 TO 11998]", "[11999 TO *]"))
                String[] ranges = new String[]{"[00000 TO 09999]", "[10000 TO 11998]", "[11999 TO 30000]"};
                facetHandlers.add(new RangeFacetHandler(
                        FIELD_PRICE, new PredefinedTermListFactory(Integer.class, "0"), Arrays.asList(ranges)));

                BoboIndexReader boboIndexReader = BoboIndexReader.getInstance(indexReader, facetHandlers);
                try {
                    BrowseRequest browseRequest = new BrowseRequest();
                    browseRequest.setCount(10);
                    browseRequest.setOffset(0);
                    browseRequest.setQuery(query);

                    // Optional sorting of the hits, e.g.:
                    //   browseRequest.setSort(new SortField[] {
                    //       new SortField("vendor", SortField.STRING, true),
                    //       new SortField("price", SortField.INT, true)});

                    FacetSpec facetSpec = new FacetSpec();
                    facetSpec.setMaxCount(10); // cap on the number of facet values reported per field
                    facetSpec.setOrderBy(FacetSortSpec.OrderHitsDesc);

                    // The same spec instance is shared by all three facet fields.
                    browseRequest.setFacetSpec(FIELD_VENDOR, facetSpec);
                    browseRequest.setFacetSpec(FIELD_CPU, facetSpec);
                    browseRequest.setFacetSpec(FIELD_PRICE, facetSpec);

                    Browsable browser = new BoboBrowser(boboIndexReader);
                    BrowseResult browseResult = browser.browse(browseRequest);

                    int totalHits = browseResult.getNumHits();
                    System.out.println("=====Total records: " + totalHits);

                    // Fetch the facet (group count) results, keyed by field name.
                    Map<String, FacetAccessible> facetMap = browseResult.getFacetMap();
                    if (facetMap.size() > 0) {
                        printFacets("-------Vendor-----------------------", facetMap.get(FIELD_VENDOR));
                        printFacets("-------CPU----------------", facetMap.get(FIELD_CPU));
                        printFacets("-------Price----------------", facetMap.get(FIELD_PRICE));
                    }
                } finally {
                    boboIndexReader.close();
                }
            } finally {
                // Release the underlying reader even when browsing fails.
                indexReader.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints a heading line followed by one {@code value(count)} line per
     * facet value.
     *
     * @param heading line printed before the facet values
     * @param facets  facet results for one field; when {@code null} (no result
     *                for that field) only the heading is printed
     */
    private static void printFacets(String heading, FacetAccessible facets) {
        System.out.println(heading);
        if (facets == null) {
            return;
        }
        List<BrowseFacet> facetVals = facets.getFacets();
        for (BrowseFacet f : facetVals) {
            System.out.println(f.getValue() + "(" + f.getHitCount() + ")");
        }
    }
}
复制代码

 

输出结果:

复制代码
=====Total records: 9
-------Vendor-----------------------
lenovo(4)
hp(3)
asus(1)
dell(1)
-------CPU----------------
Intel(7)
AMD(2)
-------Price----------------
[00000 TO 09999](4)
[11999 TO 30000](4)
[10000 TO 11998](1)
复制代码

0
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:262588次
    • 积分:3529
    • 等级:
    • 排名:第9736名
    • 原创:29篇
    • 转载:389篇
    • 译文:2篇
    • 评论:9条
    最新评论