Lucene 3.6 中文分词、分页查询、高亮显示等

1、准备工作

下载lucene 3.6.1 : http://lucene.apache.org/ 

下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug) 

下载solr 3.6.1:  http://lucene.apache.org/solr/(编译IK Analyzer时需引用包) 

OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。

2、从Oracle数据库中取数据创建索引(使用IK分词)

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package lucene.util;
 
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.wltea.analyzer.lucene.IKAnalyzer;
 
import java.sql.Connection;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
 
import modules.gk.Gk_info;
import modules.gk.Gk_infoSub;
import web.sys.Globals;
import web.db.DBConnector;
import web.db.ObjectCtl;
import web.util.StringUtil;
//Wizzer.cn
public class LuceneIndex {
     IndexWriter writer =  null ;
     FSDirectory dir =  null ;
     boolean create =  true ;
 
     public void init() {
         long a1 = System.currentTimeMillis();
         System.out.println( "[Lucene 开始执行:" new Date() +  "]" );
         Connection con = DBConnector.getconecttion();  //取得一个数据库连接
         try {
             final File docDir =  new File(Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString()); //E:\lucene
             if (!docDir.exists()) {
                 docDir.mkdirs();
             }
             String cr = Globals.SYS_COM_CONFIG.get( "sys.index.create" ).toString(); //true or false
             if ( "false" .equals(cr.toLowerCase())) {
                 create =  false ;
             }
             Directory dir = FSDirectory.open(docDir);
//            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
             Analyzer analyzer =  new IKAnalyzer( true );
             IndexWriterConfig iwc =  new IndexWriterConfig(Version.LUCENE_36, analyzer);
             if (create) {
                 // Create a new index in the directory, removing any
                 // previously indexed documents:
                 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
             else {
                 // Add new documents to an existing index:
                 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
             }
             IndexWriter writer =  new IndexWriter(dir, iwc);
             String sql =  "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 " ;
             int rowCount = ObjectCtl.getRowCount(con, sql);
             int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get( "sys.index.size" ).toString());    //每页记录数
             int pages = (rowCount -  1 ) / pageSize +  1 //计算总页数
             ArrayList list =  null ;
             Gk_infoSub gk =  null ;
             for ( int i =  1 ; i < pages+ 1 ; i++) {
                 long a = System.currentTimeMillis();
                 list = ObjectCtl.listPage(con, sql, i, pageSize,  new Gk_infoSub());
                 for ( int j =  0 ; j < list.size(); j++) {
                     gk = (Gk_infoSub) list.get(j);
                     Document doc =  new Document();
                     doc.add( new Field( "indexno" , StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); //主键不分词
                     doc.add( new Field( "title" , StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED));
                     doc.add( new Field( "describes" , StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED));
                     doc.add( new Field( "pdate" , StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); //日期不分词
                     doc.add( new Field( "keywords" , StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED));
                     writer.addDocument(doc);
                     ObjectCtl.executeUpdateBySql(con, "UPDATE TABLEA SET SSTAG=1 WHERE indexno='" +gk.getIndexno()+ "'" ); //更新已索引状态
                 }
 
                 long b = System.currentTimeMillis();
                 long c = b - a;
                 System.out.println( "[Lucene " + rowCount +  "条," + pages +  "页,第" + i +  "页花费时间:" + c +  "毫秒]" );
             }
             writer.commit();
 
         catch (Exception e) {
             e.printStackTrace();
         finally {
             DBConnector.freecon(con);  //释放数据库连接
             try {
                 if (writer !=  null ) {
                     writer.close();
                 }
             catch (CorruptIndexException e) {
                 e.printStackTrace();
             catch (IOException e) {
                 e.printStackTrace();
             finally {
                 try {
                     if (dir !=  null && IndexWriter.isLocked(dir)) {
                         IndexWriter.unlock(dir); //注意解锁
                     }
                 catch (IOException e) {
                     e.printStackTrace();
                 }
             }
         }
         long b1 = System.currentTimeMillis();
         long c1 = b1 - a1;
         System.out.println( "[Lucene 执行完毕,花费时间:" + c1 +  "毫秒,完成时间:" new Date() +  "]" );
     }
}
 

3、单字段查询以及多字段分页查询高亮显示

 
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
package lucene.util;
 
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.Version;
import modules.gk.Gk_infoSub;
 
import java.util.ArrayList;
import java.io.File;
import java.io.StringReader;
import java.lang.reflect.Constructor;
 
import web.util.StringUtil;
import web.sys.Globals;
import org.wltea.analyzer.lucene.IKAnalyzer;
//Wizzer.cn
public class LuceneQuery {
     private static String indexPath; // 索引生成的目录
     private int rowCount; // 记录数
     private int pages; // 总页数
     private int currentPage; // 当前页数
     private int pageSize;    //每页记录数
 
     public LuceneQuery() {
         this .indexPath = Globals.SYS_COM_CONFIG.get( "sys.index.path" ).toString();
     }
 
     public int getRowCount() {
         return rowCount;
     }
 
     public int getPages() {
         return pages;
     }
 
     public int getPageSize() {
         return pageSize;
     }
 
     public int getCurrentPage() {
         return currentPage;
     }
 
     /**
      * 函数功能:根据字段查询索引
      */
     public ArrayList queryIndexTitle(String keyWord,  int curpage,  int pageSize) {
         ArrayList list =  new ArrayList();
         try {
             if (curpage <=  0 ) {
                 curpage =  1 ;
             }
             if (pageSize <=  0 ) {
                 pageSize =  20 ;
             }
             this .pageSize = pageSize;    //每页记录数
             this .currentPage = curpage;    //当前页
             int start = (curpage -  1 ) * pageSize;
             Directory dir = FSDirectory.open( new File(indexPath));
             IndexReader reader = IndexReader.open(dir);
             IndexSearcher searcher =  new IndexSearcher(reader);
             Analyzer analyzer =  new IKAnalyzer( true );
             QueryParser queryParser =  new QueryParser(Version.LUCENE_36,  "title" , analyzer);
             queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);
             Query query = queryParser.parse(keyWord);
             int hm = start + pageSize;
             TopScoreDocCollector res = TopScoreDocCollector.create(hm,  false );
             searcher.search(query, res);
 
             SimpleHTMLFormatter simpleHTMLFormatter =  new SimpleHTMLFormatter( "<span style='color:red'>" "</span>" );
             Highlighter highlighter =  new Highlighter(simpleHTMLFormatter,  new QueryScorer(query));
             this .rowCount = res.getTotalHits();
             this .pages = (rowCount -  1 ) / pageSize +  1 //计算总页数
             TopDocs tds = res.topDocs(start, pageSize);
             ScoreDoc[] sd = tds.scoreDocs;
             for ( int i =  0 ; i < sd.length; i++) {
                 Document hitDoc = reader.document(sd[i].doc);
                 list.add(createObj(hitDoc, analyzer, highlighter));
             }
 
         catch (Exception e) {
             e.printStackTrace();
         }
 
         return list;
 
     }
     /**
      * 函数功能:根据字段查询索引
      */
     public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword,  int curpage,  int pageSize) {
         ArrayList list =  new ArrayList();
         try {
             if (curpage <=  0 ) {
                 curpage =  1 ;
             }
             if (pageSize <=  0 ) {
                 pageSize =  20 ;
             }
             this .pageSize = pageSize;    //每页记录数
             this .currentPage = curpage;    //当前页
             int start = (curpage -  1 ) * pageSize;
             Directory dir = FSDirectory.open( new File(indexPath));
             IndexReader reader = IndexReader.open(dir);
             IndexSearcher searcher =  new IndexSearcher(reader);
             BooleanQuery bQuery =  new BooleanQuery();   //组合查询
             if (! "" .equals(allkeyword)) { //包含全部关键词
                 KeywordAnalyzer analyzer =  new KeywordAnalyzer();
                 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; //AND
                 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword,  new String[]{ "title" "describes" "keywords" }, flags, analyzer);
                 bQuery.add(query, BooleanClause.Occur.MUST);   //AND
             }
             if (! "" .equals(onekeyword)) {  //包含任意关键词
                 Analyzer analyzer =  new IKAnalyzer( true );
                 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; //OR
                 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword,  new String[]{ "title" "describes" "keywords" }, flags, analyzer);
                 bQuery.add(query, BooleanClause.Occur.MUST);   //AND
             }
             if (! "" .equals(nokeyword)) {  //排除关键词
                 Analyzer analyzer =  new IKAnalyzer( true );
                 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; //NOT
                 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword,  new String[]{ "title" "describes" "keywords" }, flags, analyzer);
                 bQuery.add(query, BooleanClause.Occur.MUST_NOT);   //AND
 
             }
             int hm = start + pageSize;
             TopScoreDocCollector res = TopScoreDocCollector.create(hm,  false );
             searcher.search(bQuery, res);
             SimpleHTMLFormatter simpleHTMLFormatter =  new SimpleHTMLFormatter( "<span style='color:red'>" "</span>" );
             Highlighter highlighter =  new Highlighter(simpleHTMLFormatter,  new QueryScorer(bQuery));
             this .rowCount = res.getTotalHits();
             this .pages = (rowCount -  1 ) / pageSize +  1 //计算总页数
             System.out.println( "rowCount:" + rowCount);
             TopDocs tds = res.topDocs(start, pageSize);
             ScoreDoc[] sd = tds.scoreDocs;
             Analyzer analyzer =  new IKAnalyzer();
             for ( int i =  0 ; i < sd.length; i++) {
                 Document hitDoc = reader.document(sd[i].doc);
                 list.add(createObj(hitDoc, analyzer, highlighter));
             }
 
         catch (Exception e) {
             e.printStackTrace();
         }
 
         return list;
 
     }
 
     /**
      * 创建返回对象(高亮)
      */
 
     private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) {
 
         Gk_infoSub gk =  new Gk_infoSub();
         try {
 
             if (doc !=  null ) {
                 gk.setIndexno(StringUtil.null2String(doc.get( "indexno" )));
                 gk.setPdate(StringUtil.null2String(doc.get( "pdate" )));
                 String title = StringUtil.null2String(doc.get( "title" ));
                 gk.setTitle(title);
                 if (! "" .equals(title)) {
                     highlighter.setTextFragmenter( new SimpleFragmenter(title.length()));
                     TokenStream tk = analyzer.tokenStream( "title" new StringReader(title));
                     String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title));
                     if (! "" .equals(htext)) {
                         gk.setTitle(htext);
                     }
                 }
                 String keywords = StringUtil.null2String(doc.get( "keywords" ));
                 gk.setKeywords(keywords);
                 if (! "" .equals(keywords)) {
                     highlighter.setTextFragmenter( new SimpleFragmenter(keywords.length()));
                     TokenStream tk = analyzer.tokenStream( "keywords" new StringReader(keywords));
                     String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords));
                     if (! "" .equals(htext)) {
                         gk.setKeywords(htext);
                     }
                 }
                 String describes = StringUtil.null2String(doc.get( "describes" ));
                 gk.setDescribes(describes);
                 if (! "" .equals(describes)) {
                     highlighter.setTextFragmenter( new SimpleFragmenter(describes.length()));
                     TokenStream tk = analyzer.tokenStream( "keywords" new StringReader(describes));
                     String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes));
                     if (! "" .equals(htext)) {
                         gk.setDescribes(htext);
                     }
                 }
 
             }
             return gk;
         }
         catch (Exception e) {
 
             e.printStackTrace();
             return null ;
         }
         finally {
             gk =  null ;
         }
 
     }
 
     private synchronized static Object createObj(Document doc) {
 
         Gk_infoSub gk =  new Gk_infoSub();
         try {
 
             if (doc !=  null ) {
                 gk.setIndexno(StringUtil.null2String(doc.get( "indexno" )));
                 gk.setPdate(StringUtil.null2String(doc.get( "pdate" )));
                 gk.setTitle(StringUtil.null2String(doc.get( "title" )));
                 gk.setKeywords(StringUtil.null2String(doc.get( "keywords" )));
                 gk.setDescribes(StringUtil.null2String(doc.get( "describes" )));
             }
             return gk;
         }
         catch (Exception e) {
 
             e.printStackTrace();
             return null ;
         }
         finally {
             gk =  null ;
         }
 
     }
}
  单字段查询:
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Single-field search: read paging parameters from the form, run the title
// query, and push the result list plus paging metadata back to the view.
long a = System.currentTimeMillis();
try {
    int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
    int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
    // escape Lucene query-syntax characters in the user input
    String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title")));
    LuceneQuery lu = new LuceneQuery();
    form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize));
    form.addResult("curPage", lu.getCurrentPage());
    form.addResult("pageSize", lu.getPageSize());
    form.addResult("rowCount", lu.getRowCount());
    form.addResult("pageCount", lu.getPages());
} catch (Exception e) { // FIX: the pasted snippet was missing "}" before catch
    e.printStackTrace();
}
long b = System.currentTimeMillis();
long c = b - a;
System.out.println("[搜索信息花费时间:" + c + "毫秒]");
多字段查询:
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// Multi-field ("advanced") search: all/any/excluded keyword strings are
// sanitized, then passed to queryIndexFields; paging metadata is returned.
long a = System.currentTimeMillis();
try {
    int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
    int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
    // escape Lucene query-syntax characters in each user input
    String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword")));
    String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword")));
    String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword")));
    LuceneQuery lu = new LuceneQuery();
    form.addResult("list", lu.queryIndexFields(allkeyword, onekeyword, nokeyword, curpage, pagesize));
    form.addResult("curPage", lu.getCurrentPage());
    form.addResult("pageSize", lu.getPageSize());
    form.addResult("rowCount", lu.getRowCount());
    form.addResult("pageCount", lu.getPages());
} catch (Exception e) { // FIX: the pasted snippet was missing "}" before catch
    e.printStackTrace();
}
long b = System.currentTimeMillis();
long c = b - a;
System.out.println("[高级检索花费时间:" + c + "毫秒]");

4、Lucene通配符查询

?
1
2
3
4
5
6
7
8
9
// Prefix (right-truncated wildcard) search on the title field,
// combined into a BooleanQuery so further clauses can be ANDed in.
BooleanQuery bQuery =  new BooleanQuery();   // combined query
if (! "" .equals(title)) {
     // "title*" matches any title starting with the given text
     WildcardQuery w1 =  new WildcardQuery( new Term( "title" , title+  "*" ));
 
     bQuery.add(w1, BooleanClause.Occur.MUST);   // AND
}
// collect just enough docs to cover the requested page
int hm = start + pageSize;
TopScoreDocCollector res = TopScoreDocCollector.create(hm,  false );
searcher.search(bQuery, res);
 

5、Lucene嵌套查询

实现SQL:(unitid like 'unitid%'  and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Nested boolean query, equivalent to the SQL:
//   (unitid LIKE 'unitid%' AND idml LIKE 'id2%')
//     OR (tounitid LIKE 'unitid%' AND tomlid LIKE 'id2%' AND tostate = 1)
BooleanQuery bQuery = new BooleanQuery();
BooleanQuery b1 = new BooleanQuery();
WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*"));
WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*"));
b1.add(w1, BooleanClause.Occur.MUST); // AND
b1.add(w2, BooleanClause.Occur.MUST); // AND
bQuery.add(b1, BooleanClause.Occur.SHOULD); // OR
BooleanQuery b2 = new BooleanQuery();
WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*"));
WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*"));
WildcardQuery w5 = new WildcardQuery(new Term("tostate", "1")); // FIX: pasted snippet was missing the comma
b2.add(w3, BooleanClause.Occur.MUST); // AND
b2.add(w4, BooleanClause.Occur.MUST); // AND
b2.add(w5, BooleanClause.Occur.MUST); // AND
bQuery.add(b2, BooleanClause.Occur.SHOULD); // OR

6、Lucene先根据时间排序后分页

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
int hm = start + pageSize;
// sort by pdate (stored as a string field) in descending order
Sort sort = new Sort(new SortField("pdate", SortField.STRING, true));
TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false);
searcher.search(bQuery, res);
this.rowCount = res.getTotalHits();
this.pages = (rowCount - 1) / pageSize + 1; // total pages (ceiling division)
// Second query fetches ALL hits sorted — inefficient; see the note below.
TopDocs tds = searcher.search(bQuery, rowCount, sort); // res.topDocs(start, pageSize);
ScoreDoc[] sd = tds.scoreDocs;
System.out.println("rowCount:" + rowCount);
int i = 0;
for (ScoreDoc scoreDoc : sd) {
    i++;
    // FIX: original used "i < start", which re-included the last row of the
    // previous page (start is a 0-based offset, i is 1-based here).
    if (i <= start) {
        continue;
    }
    if (i > hm) {
        break;
    }
    Document doc = searcher.doc(scoreDoc.doc);
    list.add(createObj(doc));
}
这个效率不高,正常的做法是创建索引的时候进行排序,之后使用分页方法,不要这样进行2次查询。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值