Lucene.Net实现GroupBy的效果(2.3.1版)

本文简单介绍Lucene.Net实现GroupBy效果的方法,与《Lucene.Net 按类别统计搜索结果数 》一文类似。注意,这种使用方法很影响效率,特别是命中结果多的情况下。这段代码修正自2.3.1版本,其它版本可能会与此有差别。

改造方法仍然是修改IndexSearcher,这里不再修改类库,而是通过自己的代码来实现。

扩充IndexSearcher类:
    /// <summary>
    
/// 增加了GroupBy功能的IndexSearcher
    
/// </summary>
    public class IndexSearcherExtension : IndexSearcher
    {
        
/// <summary>
        
/// 这里只用这一个构造函数,其它的就不再列出。
        
/// </summary>
        
/// <param name="path"></param>
        public IndexSearcherExtension(string path) : base(path) { }

        
/// <summary>
        
/// 增加GroupBy字段
        
/// </summary>
        private string fieldName;
        
/// <summary>
        
/// 给TopDocCollectorExtension类的Collect方法使用。
        
/// </summary>
        public string FieldName {
            
get { return fieldName; }
        }
        
/// <summary>
        
/// 在调用Search方法前一定要调用该方法。
        
/// </summary>
        
/// <param name="fieldName"></param>
        public void GroupBy(string fieldName) {
            
this.fieldName = fieldName;
        }
        
/// <summary>
        
/// 重写Seach方法,使其能调用构造好的方法。
        
/// </summary>
        
/// <param name="weight"></param>
        
/// <param name="filter"></param>
        
/// <param name="nDocs"></param>
        
/// <returns></returns>
        public override TopDocs Search(Weight weight, Filter filter, int nDocs)
        {
            
if (nDocs <= 0)
                
// null might be returned from hq.top() below.
                throw new System.ArgumentException("nDocs must be > 0");

            TopDocCollectorExtension collector 
= new TopDocCollectorExtension(nDocs, this);
            Search(weight, filter, collector);
            
return collector.TopDocs();
        }
    }

 

实现与 HitQueue类完全一致,只因为这里无法使用类库提供的构造函数:
    /// <summary>
    
/// 实现与 HitQueue类完全一致,只因为这里无法使用类库提供的构造函数
    
/// </summary>
    public class HitQueueExtension : PriorityQueue
    {
        
internal HitQueueExtension(int size)
        {
            Initialize(size);
        }
        
        
public override bool LessThan(System.Object a, System.Object b)
        {
            ScoreDoc hitA 
= (ScoreDoc) a;
            ScoreDoc hitB 
= (ScoreDoc) b;
            
if (hitA.score == hitB.score)
                
return hitA.doc > hitB.doc;
            
else
                
return hitA.score < hitB.score;
        }
    }

 

     ///   <summary>
    
///  增加新的TopDocCollector类,无法直接继承TopDocCollector
    
///   </summary>
     public   class  TopDocCollectorExtension : HitCollector
    {
        
private  ScoreDoc reusableSD;

        
internal   int  totalHits;
        
internal  PriorityQueue hq;

        
///   <summary> Construct to collect a given number of hits. </summary>
        
///   <param name="numHits"> the maximum number of hits to collect
        
///   </param>
         public  TopDocCollectorExtension( int  numHits)
            : 
this (numHits,  new  HitQueueExtension(numHits))
        {
        }
        
///   <summary>
        
///  注入IndexSearcherExtension对象
        
///   </summary>
         private  IndexSearcherExtension searcher;
        
///   <summary>
        
///  构造函数注入对象
        
///   </summary>
        
///   <param name="numHits"></param>
        
///   <param name="searcher"></param>
         public  TopDocCollectorExtension( int  numHits, IndexSearcherExtension searcher)
            : 
this (numHits)
        {
            
this .searcher  =  searcher;
        }

        
internal  TopDocCollectorExtension( int  numHits, PriorityQueue hq)
        {
            
this .hq  =  hq;
        }

        
///   <summary>
        
///  临时数据,用于排重
        
///   </summary>
         private  Dictionary < int int >  dict  =   new  Dictionary < int int > ();
        
//  javadoc inherited
         public   override   void  Collect( int  doc,  float  score)
        {
            
if  (score  >   0.0f )
            {
                
// 排重算法
                 if  ( ! string .IsNullOrEmpty(searcher.FieldName))
                {
                    IndexReader reader 
=  searcher.GetIndexReader();
                    Document docment 
=  reader.Document(doc);
                    
string  value  =  docment.Get(searcher.FieldName).Trim();
                    
string  value1  =   string .Empty;
                    
string  value2  =   string .Empty;
                    
int  len  =  value.Length;
                    
int  len1  =  ( int )Math.Ceiling(len  /   2.0f );
                    
int  len2  =  len  -  len1;
                    
int  hash1  =  value.Substring( 0 , len1).GetHashCode();
                    
int  hash2  =  value.Substring(len1, len2).GetHashCode();
                    
if  ( ! (dict.ContainsKey(hash1)  &&  dict.ContainsValue(hash2)))
                        dict.Add(hash1, hash2);
                    
else
                        
return ;
                }

                totalHits
++ ;
                
if  (reusableSD  ==   null )
                {
                    reusableSD 
=   new  ScoreDoc(doc, score);
                }
                
else   if  (score  >=  reusableSD.score)
                {
                    
//  reusableSD holds the last "rejected" entry, so, if
                    
//  this new score is not better than that, there's no
                    
//  need to try inserting it
                    reusableSD.doc  =  doc;
                    reusableSD.score 
=  score;
                }
                
else
                {
                    
return ;
                }
                reusableSD 
=  (ScoreDoc)hq.InsertWithOverflow(reusableSD);
            }
        }

        
///   <summary> The total number of documents that matched this query.  </summary>
         public   virtual   int  GetTotalHits()
        {
            
return  totalHits;
        }

        
///   <summary> The top-scoring hits.  </summary>
         public   virtual  TopDocs TopDocs()
        {
            ScoreDoc[] scoreDocs 
=   new  ScoreDoc[hq.Size()];
            
for  ( int  i  =  hq.Size()  -   1 ; i  >=   0 ; i -- )
                
//  put docs in array
                scoreDocs[i]  =  (ScoreDoc)hq.Pop();

            
float  maxScore  =  (totalHits  ==   0 ?  System.Single.NegativeInfinity : scoreDocs[ 0 ].score;

            
return   new  TopDocs(totalHits, scoreDocs, maxScore);
        }
    }
OK生产者完成了,下面看看消费者怎么搞。
         static   void  Main( string [] args)
        {
            IndexWriter writer 
=   new  IndexWriter( " e:\\index " new  StandardAnalyzer(),  true );
            Document doc 
=   new  Document();
            doc.Add(
new  Field( " field " " query value! " , Field.Store.YES, Field.Index.TOKENIZED));
            writer.AddDocument(doc);
            writer.AddDocument(doc);
            writer.AddDocument(doc);
            writer.Close();

            IndexSearcherExtension searcher 
=   new  IndexSearcherExtension( " e:\\index " );
            searcher.GroupBy(
" field " );
            Query q 
=   new  QueryParser( " field " new  StandardAnalyzer())
                .Parse(
" query " );
            Hits docs 
=  searcher.Search(q);
            
for  ( int  i  =   0 ; i  <  docs.Length(); i ++ )
            {
                Console.WriteLine(docs.Doc(i).Get(
" field " ));
            }
            searcher.Close();

            Console.ReadKey();
        }
添加了三个相同的文档,结果只查询到一个结果,从而达到了目的。这段修改比较简单,应该还可以设计出更加高效的算法。好长时间没写博客有些生疏了~~!
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值