看过王老师的《信息检索导论》,一直想学Lucene,看看从实现的角度,搜索引擎是如何工作的。
正好工作中用到一点lucene,尝试使用,为后续分析做好准备。
截止到现在,lucene已经更新到4.8了,适配Java8,并做了很多封装。
我会用3.6来进行学习,因为阅读Lucene源码还是4.0以下的版本更为合适,而且对于一些简单查询,3.6也足够用了。
1 构建索引
/**
 * Builds a new Lucene index at {@code indexPath} from a tab-separated text file.
 *
 * <p>Every line of {@code inputFile} is read as UTF-8 and split at its first
 * tab character: the text before the tab is stored in the field {@code "hid"},
 * the text after it in the field {@code "tags"}. Both fields are stored and
 * indexed as single un-analyzed terms, and each line becomes its own document.
 * The index is opened with {@code OpenMode.CREATE}, so any index already
 * present in the directory is replaced.
 *
 * @param indexPath directory in which the index is created
 * @param inputFile path of the tab-separated input file
 * @return {@code true} if every line was processed and the writer was closed
 *         without an {@link IOException}; {@code false} otherwise
 */
public boolean indexBuild(String indexPath, String inputFile) {
    boolean suc = false;
    IndexWriter writer = null;
    BufferedReader reader = null;
    try {
        Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36,
                analyzer);
        // Create a new index in the directory, removing any previously
        // indexed documents.
        iwc.setOpenMode(OpenMode.CREATE);
        // Optional: for better indexing performance, if you are indexing
        // many documents, increase the RAM buffer. But if you do this,
        // increase the max heap size of the JVM (e.g. add -Xmx512m or -Xmx1g):
        // iwc.setRAMBufferSizeMB(256.0);
        writer = new IndexWriter(dir, iwc);
        reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(inputFile), "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            int tabPos = line.indexOf('\t');
            if (tabPos < 0) {
                // Without a tab there is no hid/tags pair to split; skip the
                // line instead of failing in substring().
                System.out.println("skipping malformed line: " + line);
                continue;
            }
            String hid = line.substring(0, tabPos);
            String tags = line.substring(tabPos + 1);

            // A fresh document per input line. Reusing a single Document
            // would keep the fields of all earlier lines attached to it.
            Document doc = new Document();

            // Indexed (searchable) but not tokenized, and without term
            // frequency or positional information.
            Field hidField = new Field("hid", hid, Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS);
            hidField.setIndexOptions(IndexOptions.DOCS_ONLY);
            doc.add(hidField);

            Field tagsField = new Field("tags", tags, Field.Store.YES,
                    Field.Index.NOT_ANALYZED_NO_NORMS);
            tagsField.setIndexOptions(IndexOptions.DOCS_ONLY);
            doc.add(tagsField);

            // New index, so we just add the document (no old document can
            // be there).
            System.out.println("adding " + inputFile);
            writer.addDocument(doc);
        }
        // Close on the success path inside the try so that a failure while
        // closing is reported through the return value.
        writer.close();
        writer = null;
        suc = true;
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (writer != null) {
            // Only reached after a failure: still close the writer so it
            // does not stay open.
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return suc;
}
2 检索
最开始的时候使用QueryParser生成Query,发现对于不做分析(NOT_ANALYZED)的字段检索不出结果来:QueryParser会先用Analyzer处理查询串,得到的词项与索引中原样保存的值可能对不上。改为由Term直接生成TermQuery就OK了。
// Reader over the on-disk index; assigned in init() and released in close().
private IndexReader reader;
// Analyzer assigned in init(); not read by the other methods shown here.
private Analyzer analyzer;
// Searcher wrapping the reader; used by queryTags().
private IndexSearcher searcher;
/**
 * Opens the index stored at {@code indexPath} and prepares the reader,
 * searcher and analyzer fields used by this object.
 *
 * @param indexPath directory containing the index
 * @return {@code true} if the index was opened; {@code false} if opening it
 *         failed with a {@link CorruptIndexException} or {@link IOException}
 */
public boolean init(String indexPath) {
    try {
        reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
        searcher = new IndexSearcher(reader);
        analyzer = new StandardAnalyzer(Version.LUCENE_36);
        return true;
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    // Reaching this point means one of the catch blocks ran: report the
    // failure instead of claiming success.
    return false;
}
/**
 * Looks up the stored {@code "tags"} value of the best-scoring document whose
 * {@code "hid"} field equals {@code hid} exactly.
 *
 * <p>The query is a {@link TermQuery} built directly from the given value, so
 * the text is matched as-is without passing through an analyzer.
 *
 * @param hid exact value of the {@code "hid"} field to search for
 * @return the {@code "tags"} value of the first hit, or {@code null} when
 *         {@code hid} is {@code null}, the searcher has not been initialised,
 *         nothing matches, the hit has no {@code "tags"} field, or the search
 *         fails with an I/O error
 */
public String queryTags(String hid) {
    if (hid == null || searcher == null) {
        // Nothing to search for, or init() was not called / did not succeed.
        return null;
    }
    try {
        Query query = new TermQuery(new Term("hid", hid));
        // Only the top hit is ever used, so only one is requested;
        // totalHits still reports the full match count.
        TopDocs results = searcher.search(query, 1);
        System.out.println(results.totalHits + " total matching documents");

        ScoreDoc[] hits = results.scoreDocs;
        if (hits.length == 0) {
            return null;
        }
        Document doc = searcher.doc(hits[0].doc);
        String tags = doc.get("tags");
        if (tags == null) {
            System.out.println(1 + " No tags for " + hid);
        }
        return tags;
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Releases the searcher and the reader held by this object.
 *
 * <p>Each resource is closed independently, so an {@link IOException} from
 * closing the searcher does not prevent the reader from being closed. Both
 * fields are set to {@code null} afterwards; calling this method again, or
 * calling it when nothing was opened, does nothing.
 */
public void close() {
    if (searcher != null) {
        try {
            searcher.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            searcher = null;
        }
    }
    if (reader != null) {
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            reader = null;
        }
    }
}