下面是今天初次看的一点Lucene,整理出笔记备忘~~
下载这个文件lucene-2.4.0.zip到任意目录并解压,得到lucene-core-2.4.0.jar。
这里有Lucene的API:http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//index.html
注意:Lucene 2.0以上使用的JDK版本不能低于1.5.0。
在MyEclipse中建立一个Java Project,可命名为"Hello Lucene"。
设置Project->Properties->Java Compiler, 选中"Enable project specific settings"。
在项目中建立一个Package,命名为"test",并添加一个名为"HelloWorld.java"的Class。
在项目中新建Folder,命名为lib。把lucene-core-2.4.0.jar用鼠标拖到lib中。
设置Project->Properties->Java Build Path,在Library选项卡中单击"Add JARs"按钮,把项目中的lucene-core-2.4.0.jar添加进去。使用项目内部的lucene-core-2.4.0.jar 就不用担心外部环境变量的影响了。
然后编写HelloWorld.java (直接拷贝代码了……)
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileReader;
6 import java.io.IOException;
7 import java.io.InputStreamReader;
8 import java.io.Reader;
9 import java.util.Date;
10
11 import org.apache.lucene.analysis.Analyzer;
12 import org.apache.lucene.analysis.standard.StandardAnalyzer;
13 import org.apache.lucene.document.Document;
14 import org.apache.lucene.document.Field;
15 import org.apache.lucene.index.CorruptIndexException;
16 import org.apache.lucene.index.IndexWriter;
17 import org.apache.lucene.index.Term;
18 import org.apache.lucene.queryParser.ParseException;
19 import org.apache.lucene.queryParser.QueryParser;
20 import org.apache.lucene.search.BooleanClause;
21 import org.apache.lucene.search.BooleanQuery;
22 import org.apache.lucene.search.IndexSearcher;
23 import org.apache.lucene.search.PhraseQuery;
24 import org.apache.lucene.search.Query;
25 import org.apache.lucene.search.ScoreDoc;
26 import org.apache.lucene.search.TermQuery;
27 import org.apache.lucene.search.TopDocCollector;
28 import org.apache.lucene.search.WildcardQuery;
29 import org.apache.lucene.store.FSDirectory;
30 import org.apache.lucene.store.LockObtainFailedException;
31
32 /** */ /**
33 * @version Lucene 2.4.0
34 * @author hj
35 * @date 2009.1.8
36 * @see http://www.ibm.com/developerworks/cn/java/j-lo-lucene1/
37 * @see http://blog.chinaunix.net/u/8780/showart_396199.html
38 * @see Lucene视频教程.rar
39 * (shortcut key to add package: ctrl+shift+m)
40 */
41 public class HelloWorld {
42
43 private String path = "D:/index";
44
45 /** *//**
46 * create index for strings
47 */
48 private void createIndex(){
49
50 IndexWriter writer;
51 try {
52 writer = new IndexWriter(path,new StandardAnalyzer(),true,IndexWriter.MaxFieldLength.UNLIMITED);
53 Document docA = new Document();
54 Field fieldA = new Field("content","搜索引擎",Field.Store.YES,Field.Index.ANALYZED);
55 docA.add(fieldA);
56
57 Document docB = new Document();
58 Field fieldB = new Field("content","建立索引",Field.Store.YES,Field.Index.ANALYZED);
59 docB.add(fieldB);
60
61 writer.addDocument(docA);
62 writer.addDocument(docB);
63 writer.optimize();
64 writer.close();
65
66 } catch (Exception e) {
67 // TODO Auto-generated catch block
68 e.printStackTrace();
69 }
70
71 }
72
73 /** *//**
74 * create a WildcardQuery term
75 * @return Query
76 */
77 private Query wildcardQuery(){
78 Term term = new Term("content","搜");
79 return new WildcardQuery(term);
80 }
81
82 /** *//**
83 * create a PhraseQuery term
84 * @return Query
85 */
86 private Query phraseQuery(){
87 PhraseQuery phraseQuery = new PhraseQuery();
88 phraseQuery.setSlop(2);
89 phraseQuery.add(new Term("content","搜"));
90 phraseQuery.add(new Term("content","擎"));
91 return phraseQuery;
92 }
93
94 /** *//**
95 * create a BooleanQuery term
96 * @return Query
97 */
98 private Query booleanQuery(){
99 Term term1 = new Term("content","搜");
100 Term term2 = new Term("content","引");
101
102 TermQuery termQuery1 = new TermQuery(term1);
103 TermQuery termQuery2 = new TermQuery(term2);
104
105 BooleanQuery booleanQuery = new BooleanQuery();
106 booleanQuery.add(termQuery1,BooleanClause.Occur.SHOULD);
107 booleanQuery.add(termQuery2,BooleanClause.Occur.SHOULD);
108
109 return booleanQuery;
110 }
111
112 private Query queryParser(){
113 QueryParser queryParser = new QueryParser("content",new StandardAnalyzer());
114 try {
115 return queryParser.parse("搜索 擎");
116 } catch (ParseException e) {
117 // TODO Auto-generated catch block
118 e.printStackTrace();
119 }
120 return null;
121 }
122
123 /** *//**
124 * search method
125 */
126 private void search(){
127 try {
128 IndexSearcher searcher = new IndexSearcher(path);
129 //Query query= wildcardQuery();
130 //Query query = phraseQuery();;
131 //Query query = booleanQuery();
132 Query query = queryParser();
133 TopDocCollector collector = new TopDocCollector(10);
134 searcher.search(query,collector);
135 ScoreDoc[] hits = collector.topDocs().scoreDocs;
136 //Hits hits = searcher.search(query);
137
138 Document doc;
139 for(int i=0;i<hits.length;i++){
140
141 System.out.println(hits[i].doc);
142 System.out.println(hits[i].score);
143
144 doc = searcher.doc(hits[i].doc);
145 System.out.println(doc.toString());
146 }
147
148 } catch (Exception e) {
149 // TODO Auto-generated catch block
150 e.printStackTrace();
151 }
152 }
153
154 /** *//**
155 * create index for *.txt
156 */
157 private void TxtFileIndexer() {
158 //indexDir is the directory that hosts Lucene's index files
159 File indexDir = new File(path);
160 //dataDir is the directory that hosts the text files that to be indexed
161 File dataDir = new File(path);
162 Analyzer luceneAnalyzer = new StandardAnalyzer();
163 File[] dataFiles = dataDir.listFiles();
164 IndexWriter writer;
165 try {
166 writer = new IndexWriter(indexDir,luceneAnalyzer,true,IndexWriter.MaxFieldLength.UNLIMITED);
167
168 long startTime = new Date().getTime();
169
170 for(int i=0;i<dataFiles.length;i++){
171 if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){
172 System.out.println("Indexing file " + dataFiles[i].getCanonicalPath());
173
174 Document doc = new Document();
175 Reader txtReader = new FileReader(dataFiles[i]);
176 //document.add(Field.Text("path",dataFiles[i].getCanonicalPath()));
177 //document.add(Field.Text("contents",txtReader));
178 doc.add(new Field("path",dataFiles[i].getCanonicalPath(),Field.Store.YES,Field.Index.ANALYZED));
179 doc.add(new Field("contents",txtReader));
180 writer.addDocument(doc);
181
182 }
183 }
184 writer.optimize();
185 writer.close();
186 long endTime = new Date().getTime();
187
188 System.out.println("It takes " + (endTime - startTime) + " milliseconds to create index for the text files in directory " + dataDir.getPath());
189 } catch (Exception e) {
190 // TODO Auto-generated catch block
191 e.printStackTrace();
192 }
193 }
194
195 /** *//**
196 * search method with a parameter
197 * @param keywords
198 */
199 private void TxtFileSearcher(String keywords){
200 try{
201 FSDirectory directory = FSDirectory.getDirectory(path);
202 IndexSearcher searcher = new IndexSearcher(directory);
203
204 Term term = new Term("contents",keywords);
205 TermQuery termQuery = new TermQuery(term);
206
207 TopDocCollector collector = new TopDocCollector(10);
208 searcher.search(termQuery,collector);
209 ScoreDoc[] hits = collector.topDocs().scoreDocs;
210
211 System.out.println("共有" + searcher.maxDoc()+"条索引,命中" + hits.length + "条");
212
213 int docId;
214 Document doc;
215 for(int i=0;i<hits.length;i++){
216 docId = hits[i].doc;
217 System.out.println(docId);
218 //System.out.println(hits[i].score);
219 doc = searcher.doc(docId);
220 System.out.println(doc.toString());
221 }
222 }catch(Exception e){
223 e.printStackTrace();
224 }
225 }
226
227 public static void main(String[] args) {
228/**//*
229 System.out.println("请输入要查找的关键字,例如shinhwa:");
230 BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
231
232 String keywords = new String();
233 try {
234 keywords = stdin.readLine();
235
236 HelloWorld hw = new HelloWorld();
237
238 hw.TxtFileSearcher(keywords);
239 } catch (IOException e) {
240 // TODO Auto-generated catch block
241 e.printStackTrace();
242 }
243*/
244 HelloWorld hw = new HelloWorld();
245 hw.createIndex();
246 hw.search();
247 }
248}
249
250
251
不同版本的Lucene的API差别较大,写代码时需要根据所用版本做相应修改。这个小例子没有实现太多功能。默认情况下(StandardAnalyzer)中文会被切分成单个字。TermQuery以分词后的词项(单个字或单词)为单位做精确匹配;WildcardQuery使用通配符模式匹配(*匹配任意多个字符,?匹配单个字符),并不是正则表达式,因此可以匹配词项内部的片段。