Lucene.Net实现GroupBy的效果(2.3.1版)

最新推荐文章于 2021-07-22 10:43:08 发布

weixin_34344677

最新推荐文章于 2021-07-22 10:43:08 发布

阅读量108

点赞数

本文简单介绍用Lucene.Net实现GroupBy效果的方法，与《Lucene.Net 按类别统计搜索结果数》一文类似。注意，这种做法对效率影响很大：每个命中的文档都要从索引中读取其存储字段来做排重，命中结果越多开销越大。这段代码基于2.3.1版本修改而成，其它版本可能会与此有差别。

改造方法仍然是修改IndexSearcher，这里不再修改类库，而是通过自己的代码来实现。

扩充IndexSearcher类
    /// <summary>
    /// An IndexSearcher subclass that adds a GroupBy-style search: the top-docs
    /// search is routed through a TopDocCollectorExtension, which reads the
    /// configured field name back from this searcher.
    /// </summary>
    public class IndexSearcherExtension : IndexSearcher
    {
        /// <summary>
        /// Name of the field to group by; stays null until GroupBy is called.
        /// </summary>
        private string groupField;

        /// <summary>
        /// Passes <paramref name="path"/> straight to the base IndexSearcher
        /// constructor. Only this one constructor is exposed; the other base
        /// overloads are not listed here.
        /// </summary>
        /// <param name="path">Value forwarded to the base constructor.</param>
        public IndexSearcherExtension(string path)
            : base(path)
        {
        }

        /// <summary>
        /// The group-by field name, read by TopDocCollectorExtension.Collect.
        /// </summary>
        public string FieldName
        {
            get { return this.groupField; }
        }

        /// <summary>
        /// Sets the field to group by. Must be called before Search for the
        /// grouping to take effect.
        /// </summary>
        /// <param name="fieldName">Name of the field to group by.</param>
        public void GroupBy(string fieldName)
        {
            this.groupField = fieldName;
        }

        /// <summary>
        /// Overrides the top-docs search so that hits are gathered by a
        /// TopDocCollectorExtension bound to this searcher.
        /// </summary>
        /// <param name="weight">Forwarded to the collector-based Search overload.</param>
        /// <param name="filter">Forwarded to the collector-based Search overload.</param>
        /// <param name="nDocs">Maximum number of hits to collect; must be positive.</param>
        /// <returns>The TopDocs produced by the collector.</returns>
        /// <exception cref="System.ArgumentException">nDocs is zero or negative.</exception>
        public override TopDocs Search(Weight weight, Filter filter, int nDocs)
        {
            // Reject a non-positive hit count before any searching is done.
            if (nDocs < 1)
            {
                throw new System.ArgumentException("nDocs must be > 0");
            }

            TopDocCollectorExtension groupingCollector = new TopDocCollectorExtension(nDocs, this);
            Search(weight, filter, groupingCollector);
            return groupingCollector.TopDocs();
        }
    }

实现与 HitQueue类完全一致，只因为这里无法使用类库提供的构造函数
    /// <summary>
    /// A priority queue of ScoreDoc entries. Per the original author it mirrors
    /// the library's HitQueue and exists only because the library class's
    /// constructor cannot be used from this code.
    /// </summary>
    public class HitQueueExtension : PriorityQueue
    {
        /// <summary>
        /// Creates the queue and initializes it with the given size.
        /// </summary>
        /// <param name="size">Size passed to Initialize.</param>
        internal HitQueueExtension(int size)
        {
            Initialize(size);
        }

        /// <summary>
        /// Orders two ScoreDoc entries: the lower score is "less"; when the
        /// scores are equal, the entry with the larger doc number is "less".
        /// </summary>
        /// <param name="a">First entry; must be a ScoreDoc.</param>
        /// <param name="b">Second entry; must be a ScoreDoc.</param>
        /// <returns>true when <paramref name="a"/> ranks below <paramref name="b"/>.</returns>
        public override bool LessThan(System.Object a, System.Object b)
        {
            ScoreDoc left = (ScoreDoc)a;
            ScoreDoc right = (ScoreDoc)b;

            // Differing scores decide the order on their own.
            if (left.score != right.score)
            {
                return left.score < right.score;
            }

            // Tie on score: fall back to the doc number.
            return left.doc > right.doc;
        }
    }

     /// <summary>
     /// A top-docs hit collector that can additionally drop hits whose group-by
     /// field value was already collected. Per the original author it is a
     /// stand-alone class because TopDocCollector could not be inherited directly.
     /// </summary>
     public class TopDocCollectorExtension : HitCollector
    {
        /// <summary>
        /// Spare ScoreDoc: null before the first hit, afterwards whatever the
        /// queue handed back from the last InsertWithOverflow call.
        /// </summary>
        private ScoreDoc reusableSD;

        /// <summary>Number of hits counted so far (duplicates of a group are not counted).</summary>
        internal int totalHits;

        /// <summary>Queue holding the collected hits.</summary>
        internal PriorityQueue hq;

        /// <summary>
        /// Searcher supplying the group-by field name and the index reader.
        /// Null when the collector was built without one; grouping is then disabled.
        /// </summary>
        private IndexSearcherExtension searcher;

        /// <summary>
        /// Trimmed group-by field values already collected, used for de-duplication.
        /// Exact strings are stored so two different values can never be confused.
        /// </summary>
        private HashSet<string> seenValues = new HashSet<string>();

        /// <summary> Construct to collect a given number of hits. </summary>
        /// <param name="numHits"> the maximum number of hits to collect </param>
        public TopDocCollectorExtension(int numHits)
            : this(numHits, new HitQueueExtension(numHits))
        {
        }

        /// <summary>
        /// Construct to collect a given number of hits, grouping by the field
        /// configured on <paramref name="searcher"/>.
        /// </summary>
        /// <param name="numHits">The maximum number of hits to collect.</param>
        /// <param name="searcher">Searcher whose FieldName and index reader are used for grouping.</param>
        public TopDocCollectorExtension(int numHits, IndexSearcherExtension searcher)
            : this(numHits)
        {
            this.searcher = searcher;
        }

        /// <summary>
        /// Construct around an existing queue. <paramref name="numHits"/> is not
        /// used by this constructor; only the queue is stored.
        /// </summary>
        /// <param name="numHits">Unused here.</param>
        /// <param name="hq">Queue that will receive the collected hits.</param>
        internal TopDocCollectorExtension(int numHits, PriorityQueue hq)
        {
            this.hq = hq;
        }

        /// <summary>
        /// Collects one hit. Hits whose score is not greater than zero are
        /// ignored, as are hits whose group-by value was already collected.
        /// </summary>
        /// <param name="doc">Document number of the hit.</param>
        /// <param name="score">Score of the hit.</param>
        public override void Collect(int doc, float score)
        {
            // Written as !(score > 0) rather than (score <= 0) so that a NaN
            // score is still skipped.
            if (!(score > 0.0f))
            {
                return;
            }

            if (!RegisterGroup(doc))
            {
                return;
            }

            totalHits++;
            if (reusableSD == null)
            {
                reusableSD = new ScoreDoc(doc, score);
            }
            else if (score >= reusableSD.score)
            {
                // reusableSD holds the last "rejected" entry, so, if
                // this new score is not better than that, there's no
                // need to try inserting it
                reusableSD.doc = doc;
                reusableSD.score = score;
            }
            else
            {
                return;
            }
            reusableSD = (ScoreDoc)hq.InsertWithOverflow(reusableSD);
        }

        /// <summary>
        /// Records the group-by value of <paramref name="doc"/> and reports
        /// whether the hit should be kept.
        /// </summary>
        /// <param name="doc">Document number of the hit.</param>
        /// <returns>
        /// false when the trimmed field value was already recorded; true otherwise,
        /// including when no searcher or field name is configured or the
        /// document has no value for the field.
        /// </returns>
        private bool RegisterGroup(int doc)
        {
            if (searcher == null)
            {
                return true;
            }

            string groupField = searcher.FieldName;
            if (string.IsNullOrEmpty(groupField))
            {
                return true;
            }

            Document document = searcher.GetIndexReader().Document(doc);
            string rawValue = document.Get(groupField);
            if (rawValue == null)
            {
                // No value to group on: keep the hit rather than failing.
                return true;
            }

            // Add returns false when the value is already present.
            return seenValues.Add(rawValue.Trim());
        }

        /// <summary> The total number of documents that matched this query. </summary>
        public virtual int GetTotalHits()
        {
            return totalHits;
        }

        /// <summary> The top-scoring hits. </summary>
        /// <returns>
        /// A TopDocs built from the hit count, the queued hits (popped into the
        /// array from the last slot to the first) and the score of the first
        /// array entry, or negative infinity when nothing was queued.
        /// </returns>
        public virtual TopDocs TopDocs()
        {
            ScoreDoc[] scoreDocs = new ScoreDoc[hq.Size()];
            for (int i = scoreDocs.Length - 1; i >= 0; i--)
            {
                // put docs in array
                scoreDocs[i] = (ScoreDoc)hq.Pop();
            }

            // Decide on the array itself so an empty queue can never be indexed.
            float maxScore = (scoreDocs.Length == 0) ? System.Single.NegativeInfinity : scoreDocs[0].score;

            return new TopDocs(totalHits, scoreDocs, maxScore);
        }
    }

OK生产者完成了，下面看看消费者怎么搞。

         /// <summary>
         /// Demo: indexes the same document three times, then searches with
         /// grouping enabled and prints the field value of every hit returned.
         /// </summary>
         /// <param name="args">Command-line arguments; not used.</param>
         static void Main(string[] args)
        {
            // NOTE(review): the string literals below are padded with spaces
            // (index path, field name, query text). This looks like a formatting
            // artifact of the published listing; they are kept unchanged here
            // and should be confirmed against the author's intent.
            IndexWriter writer = new IndexWriter(" e:\\index ", new StandardAnalyzer(), true);
            try
            {
                Document doc = new Document();
                doc.Add(new Field(" field ", " query value! ", Field.Store.YES, Field.Index.TOKENIZED));

                // Three identical documents, so grouping has duplicates to collapse.
                writer.AddDocument(doc);
                writer.AddDocument(doc);
                writer.AddDocument(doc);
            }
            finally
            {
                // Close the writer even when indexing fails.
                writer.Close();
            }

            IndexSearcherExtension searcher = new IndexSearcherExtension(" e:\\index ");
            try
            {
                // GroupBy has to be set before Search is called.
                searcher.GroupBy(" field ");
                Query q = new QueryParser(" field ", new StandardAnalyzer())
                    .Parse(" query ");
                Hits docs = searcher.Search(q);
                for (int i = 0; i < docs.Length(); i++)
                {
                    Console.WriteLine(docs.Doc(i).Get(" field "));
                }
            }
            finally
            {
                // Close the searcher even when the query or the output fails.
                searcher.Close();
            }

            Console.ReadKey();
        }

添加了三个相同的文档，但查询只返回一条结果，从而达到了GroupBy的目的。这段修改比较简单，应该还可以设计出更加高效的算法。好长时间没写博客有些生疏了～～！