Tag图是采用了Tag作为文章管理工具的网站经常需要呈现的一种视图。利用Lucene的优异性能,可以出色地完成这一功能。
生成一个Tag图,首先需要知道用户一共使用了哪些Tag,其次需要知道每个Tag被使用的次数。
对于这两个功能,都可以使用Lucene.Net.Index.IndexReader.Terms方法。这个方法返回索引目录下所有Term,以及它们在全部文档中被使用的次数。这就为我们生成Tag图提供了必要的基础。但是Terms方法返回的TermEnum是按照FieldName、text的方式排序的,而不是按照docfreq排序的,所以还需要实现一个排序算法。
首先是索引的结构。我设计了如下的索引结构:
docurl:文档的url
contents:文档的内容,以便全文索引
doctags:文档相关的所有tags。tag之间以空格或逗号分隔,可以使用单独的Analyzer进行解析。可以参考Analyzer以及PerFieldAnalyzerWrapper两个类。
排序算法,使用一个链表作为保存Tag的形式。它的两个方法GetList(int top)和Top(int freq)可以帮助我们设定Tag图中需要包含的Tag。TermFreq是每个Tag的数据内容。TermFreq.term是Tag的内容。TermFreq.freq是被使用的次数,这样就可以设定Tag的显示样式了。链表通过一个SortedList作为帮助信息,以便提高排序的效率。经过测试,这个排序算法对200M的TermFreq只需要11秒的时间。
2 {
3 public string term;
4 public int freq = 0 ;
5 }
/// <summary>
/// Compares two <c>TermFreq</c> instances by their <c>freq</c> field, ascending.
/// Entries with the same <c>freq</c> compare as equal; <c>term</c> is not used as a tie-breaker.
/// </summary>
internal class TermFreqCompare : System.Collections.IComparer
{
    #region IComparer Members

    /// <summary>
    /// Compares <paramref name="x"/> and <paramref name="y"/> by <c>freq</c>.
    /// </summary>
    /// <param name="x">First object; must be a <c>TermFreq</c> or null.</param>
    /// <param name="y">Second object; must be a <c>TermFreq</c> or null.</param>
    /// <returns>
    /// A negative value when <paramref name="x"/> sorts before <paramref name="y"/>,
    /// zero when they are equal, a positive value otherwise. A null reference sorts
    /// before any non-null instance.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// Either argument is non-null but is not a <c>TermFreq</c>.
    /// </exception>
    public int Compare(object x, object y)
    {
        TermFreq f1 = x as TermFreq;
        TermFreq f2 = y as TermFreq;

        // "as" yields null both for a null argument and for a wrong type;
        // tell the two apart so a wrong type is reported instead of dereferenced.
        if ((x != null && f1 == null) || (y != null && f2 == null))
        {
            throw new System.ArgumentException("Both arguments must be TermFreq instances.");
        }

        if (f1 == null)
        {
            return f2 == null ? 0 : -1;
        }
        if (f2 == null)
        {
            return 1;
        }

        return f1.freq.CompareTo(f2.freq);
    }

    #endregion
}
/// <summary>
/// Doubly linked list of <c>TermFreq</c> entries kept sorted so that the entry the
/// comparer ranks highest is at the head. A <c>SortedList</c> keyed by <c>freq</c>
/// remembers one node per frequency and is used only as a hint for where to start
/// the insertion walk.
/// </summary>
internal class TermFreqSortedList
{
    // Head of the chain (entry the comparer ranks highest).
    private Element root;
    // Placeholder node created by the constructor (freq 0, term null). It keeps the
    // chain non-empty so Add always has a node to compare against, and is never
    // handed back to callers.
    private Element sentinel;
    private System.Collections.IComparer comparer;
    // freq -> most recently inserted node carrying that freq (the first node of its run).
    private System.Collections.SortedList list;

    /// <summary>Node of the doubly linked list.</summary>
    internal class Element
    {
        public Element prev;
        public Element next;
        public TermFreq current;
    }

    /// <summary>
    /// Creates an empty list ordered by <paramref name="comparer"/>.
    /// </summary>
    /// <param name="comparer">Comparer applied to the stored <c>TermFreq</c> objects.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="comparer"/> is null.</exception>
    public TermFreqSortedList(System.Collections.IComparer comparer)
    {
        if (comparer == null)
        {
            throw new System.ArgumentNullException("comparer");
        }

        sentinel = new Element();
        sentinel.current = new TermFreq();
        root = sentinel;
        this.comparer = comparer;
        list = new System.Collections.SortedList();
    }

    /// <summary>
    /// Picks the node from which <see cref="Add"/> starts walking and registers
    /// <paramref name="freq"/> in the lookup table if it is new.
    /// </summary>
    /// <param name="freq">Frequency of the entry about to be inserted.</param>
    /// <returns>
    /// The node stored for <paramref name="freq"/> when the frequency is already known;
    /// otherwise the node stored for the next smaller known frequency, or the placeholder
    /// node when no smaller frequency is known. The result is only a starting hint:
    /// <see cref="Add"/> walks in both directions, so any node of the chain is valid.
    /// </returns>
    private Element GetStartElement(int freq)
    {
        int index = list.IndexOfKey(freq);
        if (index >= 0)
        {
            return list.GetByIndex(index) as Element;
        }

        // New frequency: reserve its slot so the position among known keys can be read.
        list.Add(freq, null);
        index = list.IndexOfKey(freq);
        if (index > 0)
        {
            // Positional access is required here; the SortedList indexer looks up by key.
            return list.GetByIndex(index - 1) as Element;
        }

        return sentinel;
    }

    /// <summary>
    /// Inserts <paramref name="o"/> at its sorted position. An entry that compares equal
    /// to existing ones is placed in front of them.
    /// </summary>
    /// <param name="o">Entry to insert.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="o"/> is null.</exception>
    public void Add(TermFreq o)
    {
        if (o == null)
        {
            throw new System.ArgumentNullException("o");
        }

        Element ele = GetStartElement(o.freq);
        if (ele == null)
        {
            ele = root;
        }

        Element oEle = new Element();
        oEle.current = o;
        list[o.freq] = oEle;

        while (true)
        {
            int compareResult = comparer.Compare(ele.current, oEle.current);
            if (compareResult > 0)
            {
                // ele ranks above the new entry: its slot lies toward the tail.
                if (ele.next == null || comparer.Compare(ele.next.current, oEle.current) < 0)
                {
                    InsertAfter(ele, oEle);
                    return;
                }
                ele = ele.next;
            }
            else if (compareResult < 0)
            {
                // ele ranks below the new entry: its slot lies toward the head.
                if (ele.prev == null || comparer.Compare(ele.prev.current, oEle.current) > 0)
                {
                    InsertBefore(ele, oEle);
                    return;
                }
                ele = ele.prev;
            }
            else
            {
                // Equal rank: the newest entry goes in front of the existing one.
                InsertBefore(ele, oEle);
                return;
            }
        }
    }

    /// <summary>Links <paramref name="node"/> directly in front of <paramref name="anchor"/>.</summary>
    private void InsertBefore(Element anchor, Element node)
    {
        node.prev = anchor.prev;
        node.next = anchor;
        if (anchor.prev != null)
        {
            anchor.prev.next = node;
        }
        else
        {
            root = node;
        }
        anchor.prev = node;
    }

    /// <summary>Links <paramref name="node"/> directly behind <paramref name="anchor"/>.</summary>
    private void InsertAfter(Element anchor, Element node)
    {
        node.prev = anchor;
        node.next = anchor.next;
        if (anchor.next != null)
        {
            anchor.next.prev = node;
        }
        anchor.next = node;
    }

    /// <summary>
    /// Returns up to <paramref name="top"/> entries, starting from the head of the list.
    /// </summary>
    /// <param name="top">Maximum number of entries to return; zero or less yields an empty list.</param>
    /// <returns>An <c>ArrayList</c> of <c>TermFreq</c> objects in list order.</returns>
    public System.Collections.ArrayList GetList(int top)
    {
        System.Collections.ArrayList result = new System.Collections.ArrayList();
        for (Element ele = root; ele != null && result.Count < top; ele = ele.next)
        {
            if (ele == sentinel)
            {
                continue;
            }
            result.Add(ele.current);
        }
        return result;
    }

    /// <summary>
    /// Returns the leading entries whose <c>freq</c> is at least <paramref name="freq"/>,
    /// stopping at the first entry below that threshold.
    /// </summary>
    /// <param name="freq">Minimum frequency an entry must have to be returned.</param>
    /// <returns>An <c>ArrayList</c> of <c>TermFreq</c> objects in list order.</returns>
    public System.Collections.ArrayList Top(int freq)
    {
        System.Collections.ArrayList result = new System.Collections.ArrayList();
        for (Element ele = root; ele != null; ele = ele.next)
        {
            if (ele == sentinel)
            {
                continue;
            }
            if (ele.current.freq < freq)
            {
                break;
            }
            result.Add(ele.current);
        }
        return result;
    }
}
文档生成的代码:
// Add the three fields of the index structure to the document: docurl, contents, doctags.
2 doc.Add(Field.Keyword( " docurl " , docurl));
3 doc.Add(Field.Text( " contents " ,contents));
4 // storeTermVector == true: this lets us later read, through TermFreqVector, how many times each tag was applied within a single document, so that a tag map for an individual document can be generated.
5 doc.Add(Field.Text( " doctags " , reader, true ));
测试代码:
// Sorted container that receives one entry per term of the enumerator.
TermFreqSortedList list = new TermFreqSortedList(new TermFreqCompare());

// Copy the text and document frequency of every enumerated term into the list.
while (enu.Next())
{
    Lucene.Net.Index.Term currentTerm = enu.Term();

    TermFreq entry = new TermFreq();
    entry.freq = enu.DocFreq();
    entry.term = currentTerm.Text();
    list.Add(entry);
}

// Print what GetList(5) returns.
foreach (object item in list.GetList(5))
{
    TermFreq ff = item as TermFreq;

    string line = string.Format(" Term:{0}.\t\t\tDocFreq:{1} ", ff.term, ff.freq);
    Console.WriteLine(line);
}

// Print what Top(3) returns.
foreach (object item in list.Top(3))
{
    TermFreq ff = item as TermFreq;

    string line = string.Format(" Term:{0}.\t\t\tDocFreq:{1} ", ff.term, ff.freq);
    Console.WriteLine(line);
}