背景:在一个项目中搜索产品库时,需要实现"同一公司只显示一个产品"的过滤功能.出于性能考虑,项目已经采用Lucene.Net全文搜索架构,因此这个"同一公司一个产品"的过滤功能只能在Lucene搜索层面上实现.
本文的修改基于Lucene.Net 2.9.2的源码,下面通过源码来逐步介绍.
一 首先修改IndexSearcher.cs文件:在此文件中增加用于分组(group by)排重的字段属性"FieldName",同时增加给该字段赋值的方法GroupBy(string fieldName).注意:使用时必须在调用Search方法之前先调用GroupBy方法.
IndexSearcher.cs添加如下代码
/// <summary>
/// Name of the index field used to group (de-duplicate) search hits.
/// It has no initializer, so it is null until a value is assigned.
/// </summary>
private string fieldName;

/// <summary>
/// Gets the name of the group-by field. Intended to be read by the
/// collector's Collect method when it decides whether to de-duplicate.
/// </summary>
public string FieldName
{
    get
    {
        return this.fieldName;
    }
}
/// <summary>
/// Sets the field on which search hits are grouped (de-duplicated).
/// Call this before calling Search; the collector reads the value
/// through <see cref="FieldName"/> while collecting hits.
/// </summary>
/// <remarks>
/// NOTE(review): the value is stored on this searcher instance, so it
/// stays in effect for every later search made through the instance
/// until it is changed. Confirm that is acceptable if the searcher is
/// shared between callers.
/// </remarks>
/// <param name="fieldName">
/// Name of the field to group by. A null or empty name leaves grouping
/// switched off, because the collector only groups when the name is
/// non-empty.
/// </param>
public void GroupBy(string fieldName)
{
this.fieldName = fieldName;
}
二 修改TopFieldDocCollector.cs:在该类中新增一个TopFieldDocCollector构造函数,新构造函数比原来多一个IndexSearcher参数,
然后修改Collect方法,下面就是实现排重效果的关键了.
先定义一个泛型字典(Dictionary),用于保存已出现过的groupby字段值;在Collect方法中判断当前文档的字段值是否已存在于dict中,如果已存在则直接返回,跳过该文档,继续处理下一条.
/// <summary>
/// Searcher supplied through the constructor. Collect reads its
/// FieldName to find out which field hits are de-duplicated on.
/// </summary>
private IndexSearcher searcher;

/// <summary>
/// Creates a collector whose hit queue is a FieldSortedHitQueue and
/// that remembers the searcher for later use while collecting.
/// </summary>
/// <param name="reader">Index reader handed to the FieldSortedHitQueue.</param>
/// <param name="sort">Sort whose fields are handed to the FieldSortedHitQueue.</param>
/// <param name="numHits">Size argument handed to the FieldSortedHitQueue.</param>
/// <param name="searcher">Searcher stored in this collector.</param>
public TopFieldDocCollector(
    IndexReader reader,
    Sort sort,
    int numHits,
    IndexSearcher searcher)
    : base(new FieldSortedHitQueue(reader, sort.fields, numHits))
{
    this.searcher = searcher;
}
/// <summary>
/// Scratch data for de-duplication: the group-field values that have
/// already been collected. Declared once only; a second identical
/// declaration of the same member does not compile.
/// </summary>
private Dictionary<int, int> dict = new Dictionary<int, int>();
/// <summary>
/// Collects one hit into the sorted hit queue. When the injected searcher
/// has a group-by field set, a hit is dropped if another hit with the
/// same value of that field has already been collected.
/// </summary>
/// <remarks>
/// The first hit collected for a given value is the one kept, whatever
/// its position under the sort would have been. Dropped hits are not
/// counted in totalHits.
/// </remarks>
/// <param name="doc">Document number of the hit.</param>
/// <param name="score">Score of the hit; hits with a score that is not above zero are ignored.</param>
/// <exception cref="System.FormatException">
/// The stored group-field value is not a valid integer.
/// </exception>
public override void Collect(int doc, float score)
{
    if (!(score > 0.0f))
    {
        return;
    }

    // De-duplicate only when a searcher was injected and a group-by field
    // is set. In the visible code the searcher is assigned only by the
    // four-argument constructor, so it may be null here.
    if (searcher != null && !string.IsNullOrEmpty(searcher.FieldName))
    {
        IndexReader reader = searcher.GetIndexReader();
        Document document = reader.Document(doc);
        string rawValue = document.Get(searcher.FieldName);

        // Get yields null when the document has no stored value for the
        // field; such a hit cannot be grouped, so it is collected as is.
        if (rawValue != null)
        {
            int groupKey = int.Parse(rawValue.Trim());
            if (dict.ContainsKey(groupKey))
            {
                return;
            }

            dict.Add(groupKey, groupKey);
        }
    }

    totalHits++;
    if (reusableFD == null)
    {
        reusableFD = new FieldDoc(doc, score);
    }
    else
    {
        // Whereas TopScoreDocCollector can skip this if the
        // score is not competitive, we cannot because the
        // comparators in the FieldSortedHitQueue.lessThan
        // aren't in general congruent with "higher score
        // wins"
        reusableFD.score = score;
        reusableFD.doc = doc;
    }

    reusableFD = (FieldDoc) hq.InsertWithOverflow(reusableFD);
}
三 修改IndexSearcher.cs:将其中原来调用构造函数TopFieldDocCollector(IndexReader reader, Sort sort, int numHits)的地方,
改为调用新的构造函数TopFieldDocCollector(IndexReader reader, Sort sort, int numHits, IndexSearcher searcher),并把当前IndexSearcher对象作为最后一个参数传入.
四 测试
/// <summary>
/// Demo: builds a three-document index under e:\index, searches
/// pro_Name for "LED" grouped by pro_Mem_ID and sorted by pro_price,
/// and prints each hit to the console.
/// </summary>
/// <remarks>
/// NOTE(review): the three sample documents all have different
/// pro_Mem_ID values, so this run prints all of them and does not
/// show a hit being dropped. Give two documents the same pro_Mem_ID
/// to see the de-duplication.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Uses the PanGu word segmenter as the analyzer.
    Lucene.Net.Analysis.Analyzer analyzer = new PanGuAnalyzer();
    PanGu.Segment.Init(@"E:\工作\LED\Project\demo\Lucene.Net_2_9\Lucene.Net_2_9_2\PanGu.xml");

    IndexWriter writer = new IndexWriter("e:\\index", analyzer, true);
    try
    {
        Document doc = new Document();
        doc.Add(new Field("pro_Name", "LED节能灯", Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("pro_Mem_ID", "61", Field.Store.YES, Field.Index.NO));
        doc.Add(new Field("pro_Attr", ",61:12,13:led,14:129,", Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("pro_price", "100", Field.Store.YES, Field.Index.UN_TOKENIZED));

        Document doc2 = new Document();
        doc2.Add(new Field("pro_Name", "LED日光灯", Field.Store.YES, Field.Index.ANALYZED));
        doc2.Add(new Field("pro_Mem_ID", "62", Field.Store.YES, Field.Index.NO));
        doc2.Add(new Field("pro_Attr", ",61:12,13:led,14:128,", Field.Store.YES, Field.Index.ANALYZED));
        doc2.Add(new Field("pro_price", "200", Field.Store.YES, Field.Index.UN_TOKENIZED));

        Document doc3 = new Document();
        doc3.Add(new Field("pro_Name", "LED灯", Field.Store.YES, Field.Index.ANALYZED));
        doc3.Add(new Field("pro_Mem_ID", "63", Field.Store.YES, Field.Index.NO));
        doc3.Add(new Field("pro_Attr", ",61:12,", Field.Store.YES, Field.Index.ANALYZED));
        doc3.Add(new Field("pro_price", "220", Field.Store.YES, Field.Index.UN_TOKENIZED));

        writer.AddDocument(doc);
        writer.AddDocument(doc2);
        writer.AddDocument(doc3);
    }
    finally
    {
        // Close even when indexing fails, so the writer is not left open.
        writer.Close();
    }

    IndexSearcher searcher = new IndexSearcher("e:\\index");
    try
    {
        BooleanQuery boolQuery = new BooleanQuery();
        string queryString = "LED";
        if (!string.IsNullOrEmpty(queryString))
        {
            boolQuery.Add(MultiFieldQueryParser.Parse(new string[] { queryString }, new string[] { "pro_Name" }, analyzer), BooleanClause.Occur.SHOULD);
        }

        // The group-by field has to be set before Search is called.
        searcher.GroupBy("pro_Mem_ID");

        Sort sort = new Sort();
        SortField f2 = new SortField("pro_price", SortField.FLOAT, false);
        sort.SetSort(new SortField[] { f2 });

        Hits docs = searcher.Search(boolQuery, null, sort);
        for (int i = 0; i < docs.Length(); i++)
        {
            // Fetch the hit once instead of once per printed field.
            Document hit = docs.Doc(i);
            Console.WriteLine(hit.Get("pro_Name") + "---" + hit.Get("pro_Mem_ID") + "--" + hit.Get("pro_Attr"));
        }
    }
    finally
    {
        searcher.Close();
    }

    Console.ReadKey();
}
五 小结 关键在于TopFieldDocCollector.cs中的Collect方法,在这个方法中大家也可以加入自定义的过滤规则,规则所需的参数可以通过IndexSearcher类传入.