1、准备工作
下载lucene 3.6.1 : http://lucene.apache.org/
下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug)
下载solr 3.6.1: http://lucene.apache.org/solr/(编译IK Analyzer时需引用包)
OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。
2、从Oracle数据库中取数据创建索引(使用IK分词)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
package
lucene.util;
import
org.apache.lucene.index.IndexWriter;
import
org.apache.lucene.index.IndexWriterConfig;
import
org.apache.lucene.index.CorruptIndexException;
import
org.apache.lucene.store.FSDirectory;
import
org.apache.lucene.store.Directory;
import
org.apache.lucene.analysis.Analyzer;
import
org.apache.lucene.analysis.standard.StandardAnalyzer;
import
org.apache.lucene.util.Version;
import
org.apache.lucene.document.Document;
import
org.apache.lucene.document.Field;
import
org.wltea.analyzer.lucene.IKAnalyzer;
import
java.sql.Connection;
import
java.io.File;
import
java.io.IOException;
import
java.util.ArrayList;
import
java.util.Date;
import
modules.gk.Gk_info;
import
modules.gk.Gk_infoSub;
import
web.sys.Globals;
import
web.db.DBConnector;
import
web.db.ObjectCtl;
import
web.util.StringUtil;
//Wizzer.cn
public class LuceneIndex {

    /** Shared writer; kept as a field so the finally block can close it. */
    IndexWriter writer = null;
    /** Index directory; kept as a field so the finally block can unlock it. */
    FSDirectory dir = null;
    /** Rebuild the index from scratch unless sys.index.create says "false". */
    boolean create = true;

    /**
     * Builds (or incrementally updates) the Lucene index from TABLEA rows with
     * STATE=1 and SSTAG<>1, analyzing text fields with the IK analyzer, then
     * marks each indexed row with SSTAG=1. Rows are processed in pages of
     * sys.index.size to bound memory use.
     */
    public void init() {
        long startMs = System.currentTimeMillis();
        System.out.println("[Lucene 开始执行:" + new Date() + "]");
        Connection con = DBConnector.getconecttion(); // obtain a database connection
        try {
            // e.g. E:\lucene — create the index directory if it does not exist yet
            final File docDir = new File(Globals.SYS_COM_CONFIG.get("sys.index.path").toString());
            if (!docDir.exists()) {
                docDir.mkdirs();
            }
            String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString(); // "true" or "false"
            if ("false".equals(cr.toLowerCase())) {
                create = false;
            }
            // BUGFIX: assign to the FIELDS instead of declaring shadowing locals.
            // The original "Directory dir = ..." and "IndexWriter writer = ..."
            // left the fields null, so the finally block below never closed the
            // writer and never released the write lock.
            dir = FSDirectory.open(docDir);
            // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            Analyzer analyzer = new IKAnalyzer(true); // IK analyzer, smart mode
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            if (create) {
                // Create a new index in the directory, removing any
                // previously indexed documents:
                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            } else {
                // Add new documents to an existing index:
                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            }
            writer = new IndexWriter(dir, iwc);
            String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 ";
            int rowCount = ObjectCtl.getRowCount(con, sql);
            // rows fetched per batch
            int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString());
            int pages = (rowCount - 1) / pageSize + 1; // total number of batches
            ArrayList list = null;
            Gk_infoSub gk = null;
            for (int i = 1; i < pages + 1; i++) {
                long batchStart = System.currentTimeMillis();
                list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub());
                for (int j = 0; j < list.size(); j++) {
                    gk = (Gk_infoSub) list.get(j);
                    Document doc = new Document();
                    // primary key: stored, not analyzed
                    doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED));
                    // date: stored, not analyzed
                    doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED));
                    writer.addDocument(doc);
                    // mark the row as indexed
                    // NOTE(review): indexno is concatenated straight into the SQL;
                    // if it can ever contain a quote this is an injection risk —
                    // prefer a parameterized update if ObjectCtl supports one.
                    ObjectCtl.executeUpdateBySql(con, "UPDATE TABLEA SET SSTAG=1 WHERE indexno='" + gk.getIndexno() + "'");
                }
                long batchMs = System.currentTimeMillis() - batchStart;
                System.out.println("[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + batchMs + "毫秒]");
            }
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            DBConnector.freecon(con); // release the database connection
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    // release the write lock if closing failed and it is still held
                    if (dir != null && IndexWriter.isLocked(dir)) {
                        IndexWriter.unlock(dir);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        long elapsedMs = System.currentTimeMillis() - startMs;
        System.out.println("[Lucene 执行完毕,花费时间:" + elapsedMs + "毫秒,完成时间:" + new Date() + "]");
    }
}
|
3、单字段查询以及多字段分页查询高亮显示
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
|
package
lucene.util;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import modules.gk.Gk_infoSub;
import web.sys.Globals;
import web.util.StringUtil;
//Wizzer.cn
public class LuceneQuery {
    // BUGFIX: indexPath was declared static but assigned through "this." in the
    // constructor, silently sharing it across instances. It is private, so making
    // it a plain instance field is invisible to callers.
    private String indexPath;   // directory the index was written to ("sys.index.path")
    private int rowCount;       // total hits of the last query
    private int pages;          // total result pages of the last query
    private int currentPage;    // page number of the last query
    private int pageSize;       // records per page of the last query

    public LuceneQuery() {
        this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString();
    }

    public int getRowCount() {
        return rowCount;
    }

    public int getPages() {
        return pages;
    }

    public int getPageSize() {
        return pageSize;
    }

    public int getCurrentPage() {
        return currentPage;
    }

    /**
     * Queries the "title" field (IK-analyzed, all terms required) and returns
     * one page of Gk_infoSub objects with matches highlighted in red spans.
     *
     * @param keyWord  user search text (should already be escaped for Lucene syntax)
     * @param curpage  1-based page number; values <= 0 fall back to 1
     * @param pageSize records per page; values <= 0 fall back to 20
     * @return list of Gk_infoSub results; empty on error
     */
    public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) {
        ArrayList list = new ArrayList();
        IndexReader reader = null;
        try {
            if (curpage <= 0) {
                curpage = 1;
            }
            if (pageSize <= 0) {
                pageSize = 20;
            }
            this.pageSize = pageSize;       // records per page
            this.currentPage = curpage;     // current page
            int start = (curpage - 1) * pageSize;
            Directory dir = FSDirectory.open(new File(indexPath));
            reader = IndexReader.open(dir);
            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new IKAnalyzer(true);
            QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer);
            queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = queryParser.parse(keyWord);
            int hm = start + pageSize; // collect just enough hits to cover this page
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            searcher.search(query, res);
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
            this.rowCount = res.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1; // total pages
            TopDocs tds = res.topDocs(start, pageSize);
            ScoreDoc[] sd = tds.scoreDocs;
            for (int i = 0; i < sd.length; i++) {
                Document hitDoc = reader.document(sd[i].doc);
                list.add(createObj(hitDoc, analyzer, highlighter));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // BUGFIX: the reader was never closed, leaking index file handles.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return list;
    }

    /**
     * Combined multi-field query over title/describes/keywords:
     * allkeyword is required (AND), onekeyword matches any field (OR),
     * nokeyword is excluded (NOT). Returns one page of highlighted results.
     *
     * @param allkeyword text that must appear (parsed with KeywordAnalyzer, i.e. as literal tokens)
     * @param onekeyword text of which any IK-analyzed term may appear
     * @param nokeyword  text whose IK-analyzed terms must not appear
     * @param curpage    1-based page number; values <= 0 fall back to 1
     * @param pageSize   records per page; values <= 0 fall back to 20
     * @return list of Gk_infoSub results; empty on error
     */
    public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) {
        ArrayList list = new ArrayList();
        IndexReader reader = null;
        try {
            if (curpage <= 0) {
                curpage = 1;
            }
            if (pageSize <= 0) {
                pageSize = 20;
            }
            this.pageSize = pageSize;       // records per page
            this.currentPage = curpage;     // current page
            int start = (curpage - 1) * pageSize;
            Directory dir = FSDirectory.open(new File(indexPath));
            reader = IndexReader.open(dir);
            IndexSearcher searcher = new IndexSearcher(reader);
            BooleanQuery bQuery = new BooleanQuery(); // combined query
            if (!"".equals(allkeyword)) {
                // must contain the whole phrase: KeywordAnalyzer keeps it as one token
                KeywordAnalyzer analyzer = new KeywordAnalyzer();
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST); // AND
            }
            if (!"".equals(onekeyword)) {
                // may contain any of the analyzed terms
                Analyzer analyzer = new IKAnalyzer(true);
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST); // AND
            }
            if (!"".equals(nokeyword)) {
                // must not contain any of the analyzed terms
                Analyzer analyzer = new IKAnalyzer(true);
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST_NOT); // NOT
            }
            int hm = start + pageSize; // collect just enough hits to cover this page
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            searcher.search(bQuery, res);
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery));
            this.rowCount = res.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1; // total pages
            System.out.println("rowCount:" + rowCount);
            TopDocs tds = res.topDocs(start, pageSize);
            ScoreDoc[] sd = tds.scoreDocs;
            Analyzer analyzer = new IKAnalyzer();
            for (int i = 0; i < sd.length; i++) {
                Document hitDoc = reader.document(sd[i].doc);
                list.add(createObj(hitDoc, analyzer, highlighter));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // BUGFIX: the reader was never closed, leaking index file handles.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return list;
    }

    /**
     * Returns the highlighted fragment for {@code text}, or the original text
     * unchanged when the text is empty or no query terms match it.
     */
    private static String highlight(String field, String text, Analyzer analyzer, Highlighter highlighter) throws Exception {
        if ("".equals(text)) {
            return text;
        }
        // fragment spans the whole value so nothing is truncated
        highlighter.setTextFragmenter(new SimpleFragmenter(text.length()));
        TokenStream tk = analyzer.tokenStream(field, new StringReader(text));
        String htext = StringUtil.null2String(highlighter.getBestFragment(tk, text));
        return "".equals(htext) ? text : htext;
    }

    /**
     * Builds a Gk_infoSub from a hit document, highlighting title, keywords
     * and describes. Returns null on error.
     */
    private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) {
        Gk_infoSub gk = new Gk_infoSub();
        try {
            if (doc != null) {
                gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
                gk.setPdate(StringUtil.null2String(doc.get("pdate")));
                gk.setTitle(highlight("title", StringUtil.null2String(doc.get("title")), analyzer, highlighter));
                gk.setKeywords(highlight("keywords", StringUtil.null2String(doc.get("keywords")), analyzer, highlighter));
                // BUGFIX: the original tokenized describes under the "keywords"
                // field name, so describes highlighting used the wrong field.
                gk.setDescribes(highlight("describes", StringUtil.null2String(doc.get("describes")), analyzer, highlighter));
            }
            return gk;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Builds a plain (non-highlighted) Gk_infoSub from a hit document.
     * Returns null on error.
     */
    private synchronized static Object createObj(Document doc) {
        Gk_infoSub gk = new Gk_infoSub();
        try {
            if (doc != null) {
                gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
                gk.setPdate(StringUtil.null2String(doc.get("pdate")));
                gk.setTitle(StringUtil.null2String(doc.get("title")));
                gk.setKeywords(StringUtil.null2String(doc.get("keywords")));
                gk.setDescribes(StringUtil.null2String(doc.get("describes")));
            }
            return gk;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
// Action-handler snippet: run a title search and push the results plus
// paging metadata back onto the form, timing the whole request.
long searchStart = System.currentTimeMillis();
try {
    int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
    int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
    // strip characters that are special to the Lucene query syntax
    String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title")));
    LuceneQuery lu = new LuceneQuery();
    form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize));
    form.addResult("curPage", lu.getCurrentPage());
    form.addResult("pageSize", lu.getPageSize());
    form.addResult("rowCount", lu.getRowCount());
    form.addResult("pageCount", lu.getPages());
} catch (Exception e) {
    e.printStackTrace();
}
long searchElapsed = System.currentTimeMillis() - searchStart;
System.out.println("[搜索信息花费时间:" + searchElapsed + "毫秒]");
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
// Action-handler snippet: run the advanced (all/any/exclude keywords)
// search and push results plus paging metadata back onto the form, timed.
long searchStart = System.currentTimeMillis();
try {
    int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
    int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
    // strip characters that are special to the Lucene query syntax
    String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword")));
    String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword")));
    String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword")));
    LuceneQuery lu = new LuceneQuery();
    form.addResult("list", lu.queryIndexFields(allkeyword, onekeyword, nokeyword, curpage, pagesize));
    form.addResult("curPage", lu.getCurrentPage());
    form.addResult("pageSize", lu.getPageSize());
    form.addResult("rowCount", lu.getRowCount());
    form.addResult("pageCount", lu.getPages());
} catch (Exception e) {
    e.printStackTrace();
}
long searchElapsed = System.currentTimeMillis() - searchStart;
System.out.println("[高级检索花费时间:" + searchElapsed + "毫秒]");
|
4、Lucene通配符查询
1
2
3
4
5
6
7
8
9
|
// Wildcard (prefix) query snippet: title LIKE 'title%' when a title was given.
BooleanQuery bQuery = new BooleanQuery();
if (!"".equals(title)) {
    WildcardQuery titlePrefix = new WildcardQuery(new Term("title", title + "*"));
    bQuery.add(titlePrefix, BooleanClause.Occur.MUST); // AND
}
int hm = start + pageSize; // collect just enough hits to cover this page
TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
searcher.search(bQuery, res);
|
5、Lucene嵌套查询
实现SQL:(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
// Nested boolean query equivalent to:
//   (unitid LIKE 'unitid%' AND idml LIKE 'id2%')
//   OR (tounitid LIKE 'unitid%' AND tomlid LIKE 'id2%' AND tostate=1)
BooleanQuery bQuery = new BooleanQuery();

// first branch: unitid/idml prefix match
BooleanQuery ownBranch = new BooleanQuery();
ownBranch.add(new WildcardQuery(new Term("unitid", unitid + "*")), BooleanClause.Occur.MUST);   // AND
ownBranch.add(new WildcardQuery(new Term("idml", id2 + "*")), BooleanClause.Occur.MUST);        // AND
bQuery.add(ownBranch, BooleanClause.Occur.SHOULD); // OR

// second branch: tounitid/tomlid prefix match with tostate=1
BooleanQuery forwardedBranch = new BooleanQuery();
forwardedBranch.add(new WildcardQuery(new Term("tounitid", unitid + "*")), BooleanClause.Occur.MUST); // AND
forwardedBranch.add(new WildcardQuery(new Term("tomlid", id2 + "*")), BooleanClause.Occur.MUST);      // AND
forwardedBranch.add(new WildcardQuery(new Term("tostate", "1")), BooleanClause.Occur.MUST);           // AND
bQuery.add(forwardedBranch, BooleanClause.Occur.SHOULD); // OR
|
6、Lucene先根据时间排序后分页
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
// Sort by date first, then slice out the requested page.
int hm = start + pageSize;
// newest first: sort on the stored pdate string, descending
Sort sort = new Sort(new SortField("pdate", SortField.STRING, true));
// first search only determines the total hit count for paging
TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false);
searcher.search(bQuery, res);
this.rowCount = res.getTotalHits();
this.pages = (rowCount - 1) / pageSize + 1; // total pages
// second search fetches ALL hits in sorted order; the page is sliced below.
// NOTE(review): fetching rowCount docs per request is O(total hits) — fine for
// small indexes, consider searchAfter-style paging for large ones.
TopDocs tds = searcher.search(bQuery, rowCount, sort);
ScoreDoc[] sd = tds.scoreDocs;
System.out.println("rowCount:" + rowCount);
int i = 0;
for (ScoreDoc scoreDoc : sd) {
    i++;
    // BUGFIX: was "i < start", which kept pageSize+1 docs on every page after
    // the first (index range [start, hm] inclusive), duplicating the previous
    // page's last document. Skip through i == start so each page holds exactly
    // pageSize docs: (start, hm].
    if (i <= start) {
        continue;
    }
    if (i > hm) {
        break;
    }
    Document doc = searcher.doc(scoreDoc.doc);
    list.add(createObj(doc));
}
|