最近因为项目需要，开始了解 Lucene 的应用。手头有一本《Lucene in Action》，不过一用起来才发现，在我使用的 Lucene 2.0 包下，该书的第一个示例就无法编译通过。找了一些资料，终于调试通过，算是一个好的开始吧。
1.建立索引:
package demo.example.searcher;
import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Builds a Lucene index over every file found (recursively) under a data
 * directory. Written against the Lucene 2.0 API.
 *
 * <p>Each file becomes one document with two fields: {@code contents}
 * (tokenized from the file's text, not stored) and {@code filename}
 * (the canonical path, stored and indexed as a single untokenized term).
 */
public class Indexer {
    private static final Log log = LogFactory.getLog(Indexer.class);

    /**
     * Indexes the hard-coded data directory into the hard-coded index
     * directory and prints the elapsed time in milliseconds.
     *
     * @param args ignored
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        File indexDir = new File("C:\\index");
        File dataDir = new File("C:\\lucene\\src");
        long start = new Date().getTime();
        index(indexDir, dataDir);
        long end = new Date().getTime();
        System.out.println("use:" + (end - start));
    }

    /**
     * Creates a fresh index in {@code indexDir} (overwriting any existing one)
     * from all files under {@code dataDir}.
     *
     * @param indexDir directory the index is written to
     * @param dataDir  root directory whose files are indexed
     * @return the number of documents in the index, or 0 if indexing failed
     *         before the count could be read
     */
    public static int index(File indexDir, File dataDir) {
        int ret = 0;
        IndexWriter writer = null;
        try {
            // 'true' => create a new index, replacing whatever is in indexDir.
            writer = new IndexWriter(indexDir, new StandardAnalyzer(), true);
            writer.setUseCompoundFile(false);
            indexDirectory(writer, dataDir);
            ret = writer.docCount();
            writer.optimize();
        } catch (Exception e) {
            log.error("Failed to build index in " + indexDir, e);
        } finally {
            // Always close the writer, even when optimize() or an earlier step
            // fails; otherwise the index write lock is left behind.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                    log.error("Failed to close index writer for " + indexDir, e);
                }
            }
        }
        return ret;
    }

    /**
     * Recursively indexes every regular file below {@code dir}.
     * Failures are logged and do not propagate to the caller.
     *
     * @param writer open index writer that receives the documents
     * @param dir    directory to walk
     */
    public static void indexDirectory(IndexWriter writer, File dir) {
        try {
            File[] files = dir.listFiles();
            // listFiles() returns null when dir is not a directory or cannot be read.
            if (files == null) {
                log.warn("Cannot list directory, skipping: " + dir);
                return;
            }
            for (File f : files) {
                if (f.isDirectory()) {
                    indexDirectory(writer, f);
                } else {
                    indexFile(writer, f);
                }
            }
        } catch (Exception e) {
            log.error("Failed to index directory " + dir, e);
        }
    }

    /**
     * Adds a single file to the index as one document.
     * Failures are logged and do not propagate to the caller.
     *
     * @param writer open index writer that receives the document
     * @param f      file to index
     */
    public static void indexFile(IndexWriter writer, File f) {
        Reader txtReader = null;
        try {
            String path = f.getCanonicalPath();
            System.out.println("Indexing:" + path);
            Document doc = new Document();
            // NOTE: FileReader decodes using the platform default charset; files
            // saved in a different encoding will produce garbled (unsearchable)
            // tokens for non-ASCII text.
            txtReader = new FileReader(f);
            doc.add(new Field("contents", txtReader));
            doc.add(new Field("filename", path, Field.Store.YES, Field.Index.UN_TOKENIZED));
            writer.addDocument(doc);
        } catch (Exception e) {
            log.error("Failed to index file " + f, e);
        } finally {
            // The reader has been consumed once addDocument() returns (or has
            // failed), so release the file handle here in every case. Closing an
            // already-closed reader is harmless.
            if (txtReader != null) {
                try {
                    txtReader.close();
                } catch (Exception e) {
                    log.warn("Failed to close reader for " + f, e);
                }
            }
        }
    }
}
2.针对上面类建立的索引进行查询:
package demo.example.searcher;
import java.util.*;
import org.apache.lucene.search.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Runs a query against the {@code contents} field of an index and prints
 * the stored {@code filename} field of every hit. Written against the
 * Lucene 2.0 API.
 */
public class Searcher {
    private static final Log log = LogFactory.getLog(Searcher.class);

    /**
     * Searches the hard-coded index directory for a hard-coded query string.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String indexDir = "C:\\index";
        String q = "查询关键字";
        search(indexDir, q);
    }

    /**
     * Parses {@code q} with a {@link StandardAnalyzer} against the
     * {@code contents} field, prints the search time in milliseconds, then
     * prints the {@code filename} of each matching document.
     * Failures are logged and do not propagate to the caller.
     *
     * @param indexDir path of the index directory to search
     * @param q        query string in QueryParser syntax
     */
    public static void search(String indexDir, String q) {
        IndexSearcher is = null;
        try {
            is = new IndexSearcher(indexDir);
            QueryParser queryParser = new QueryParser("contents", new StandardAnalyzer());
            Query query = queryParser.parse(q);
            long start = new Date().getTime();
            Hits hits = is.search(query);
            long end = new Date().getTime();
            System.out.println("use:" + (end - start));
            // Hits loads documents lazily, so the searcher must stay open
            // until the loop has finished.
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println("The right file:" + doc.get("filename"));
            }
        } catch (Exception e) {
            log.error("Search failed for query '" + q + "' on index " + indexDir, e);
        } finally {
            // Release the index files held open by the searcher.
            if (is != null) {
                try {
                    is.close();
                } catch (Exception e) {
                    log.warn("Failed to close index searcher for " + indexDir, e);
                }
            }
        }
    }
}
最后运行正常。
不过在运行测试的时候发现了一个不明白的问题:
被建立索引的文件都是 Java 源文件。在测试按关键字查询时，中英文关键字都能正常检索，但发现 Java 源文件中的部分信息被过滤了，无法检索出来。这是怎么回事啊？Lucene 会自动过滤类文件中的注释信息么？