初识lucene

最新推荐文章于 2024-08-18 21:36:31 发布

weixin_30472035

最新推荐文章于 2024-08-18 21:36:31 发布

阅读量44

点赞数

文章标签： java

原文链接：http://www.cnblogs.com/xuyadong/p/3727075.html

版权

Lucene是什么？

Lucene是一款高性能、可扩展的信息检索工具库。信息检索是指文档搜索、文档内信息搜索或者文档相关的元数据搜索等操作。

第一个Lucene程序：为指定目录下的.txt文件建立索引

package com.sbq.studylucene;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index over the files found directly inside one directory.
 *
 * <p>Every accepted file becomes one document with three fields:
 * {@code contents} (fed from a reader over the file), {@code filename} and
 * {@code fullpath} (both stored and not analyzed).
 */
public class Indexer {
    /** Index directory used by {@link #main} when no arguments are given. */
    private static final String DEFAULT_INDEX_DIR = "D:\\indexdir";
    /** Data directory used by {@link #main} when no arguments are given. */
    private static final String DEFAULT_DATA_DIR = "D:\\datadir";

    /** Directory holding the index; kept so that {@link #close()} can release it. */
    private final Directory directory;
    private final IndexWriter writer;

    /**
     * Indexes the {@code .txt} files of a data directory and prints how long it took.
     *
     * @param args optional: {@code args[0]} is the index directory and
     *             {@code args[1]} the data directory; when fewer than two
     *             arguments are given the built-in default paths are used
     * @throws IOException if the index cannot be opened, written or closed
     * @throws CloneNotSupportedException never thrown here; kept so the
     *             signature stays unchanged
     */
    public static void main(String[] args) throws IOException, CloneNotSupportedException {
        // Honour command-line arguments instead of always overwriting them.
        boolean hasArgs = args != null && args.length >= 2;
        String indexDir = hasArgs ? args[0] : DEFAULT_INDEX_DIR; // where the index is created
        String dataDir = hasArgs ? args[1] : DEFAULT_DATA_DIR;   // where the .txt files live

        long startTime = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            indexer.close();
        }
        long endTime = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took " + (endTime - startTime) + "ms");
    }

    /**
     * Opens an {@link IndexWriter} on the given directory.
     *
     * @param indexDir path of the directory that holds (or will hold) the index
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        directory = FSDirectory.open(new File(indexDir));
        boolean opened = false;
        try {
            // IndexWriterConfig carries all IndexWriter settings (available since Lucene 3.1).
            writer = new IndexWriter(directory,
                    new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48)));
            opened = true;
        } finally {
            if (!opened) {
                // Do not leak the directory when the writer could not be created.
                directory.close();
            }
        }
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws IOException if closing either resource fails
     */
    public void close() throws IOException {
        try {
            writer.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes every regular, visible, readable file directly inside
     * {@code dataDir} that the filter accepts. Sub-directories are skipped.
     *
     * @param dataDir directory whose files are indexed
     * @param filter  decides which files are indexed; {@code null} accepts all
     * @return the number of files added to the index by this call
     * @throws IOException if {@code dataDir} cannot be listed or a file cannot
     *                     be indexed
     */
    public int index(String dataDir, FileFilter filter) throws IOException {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }

        // Count locally: writer.numDocs() would also include documents that
        // were already in the index when the index directory is reused.
        int indexed = 0;
        for (File file : files) {
            boolean candidate = !file.isDirectory() && !file.isHidden()
                    && file.exists() && file.canRead();
            // Index only when there is no filter or the filter accepts the file.
            if (candidate && (filter == null || filter.accept(file))) {
                indexFile(file);
                indexed++;
            }
        }
        return indexed;
    }

    /** Accepts only files whose name ends in {@code .txt}, ignoring case. */
    private static class TextFilesFilter implements FileFilter {

        @Override
        public boolean accept(File pathname) {
            return pathname.getName().toLowerCase().endsWith(".txt");
        }

    }

    /**
     * Creates the Lucene document describing one file.
     *
     * @param f file to describe
     * @return a document with the {@code contents}, {@code filename} and
     *         {@code fullpath} fields
     * @throws IOException if the file cannot be opened or its path resolved
     */
    protected Document getDocument(File f) throws IOException {
        Document doc = new Document();
        // File contents. NOTE(review): FileReader decodes with the platform
        // default charset; confirm that matches the encoding of the data files.
        doc.add(new Field("contents", new FileReader(f)));
        // File name, stored verbatim.
        doc.add(new Field("filename", f.getName(), Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        // Canonical path, stored verbatim.
        doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Adds one file to the index and logs its canonical path to standard output.
     *
     * @param file file to index
     * @throws IOException if the file cannot be read or the index written
     */
    public void indexFile(File file) throws IOException {
        System.out.println("Indexing " + file.getCanonicalPath());
        Document doc = getDocument(file);
        writer.addDocument(doc);
    }

}

简单的搜索程序：

package com.sbq.studylucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a query against the {@code contents} field of an existing Lucene index
 * and prints the {@code fullpath} of each of the top matching documents.
 */
public class Searcher {

    /** Maximum number of hits requested from the index. */
    private static final int MAX_HITS = 10;

    /**
     * Searches the index and prints the matches to standard output.
     *
     * @param indexDir path of the directory that holds the index
     * @param q        query string, parsed with {@link QueryParser} against the
     *                 {@code contents} field
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code q} is not a valid query
     */
    public static void search(String indexDir, String q) throws IOException,
            ParseException {
        // try-with-resources closes the reader and then the directory even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(new File(indexDir));
                DirectoryReader ireader = DirectoryReader.open(dir)) {
            IndexSearcher is = new IndexSearcher(ireader);

            // Build the parser and parse the query string.
            QueryParser parser = new QueryParser(Version.LUCENE_48, "contents",
                    new StandardAnalyzer(Version.LUCENE_48));
            Query query = parser.parse(q);

            // Time only the search itself.
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, MAX_HITS);
            long end = System.currentTimeMillis();

            // Report the search outcome.
            System.out.println("Found " + hits.totalHits + " document(s) in "
                    + (end - start) + "ms that matched query '" + q + "':");

            for (ScoreDoc sd : hits.scoreDocs) {
                // Fetch the matching document by its id.
                Document doc = is.doc(sd.doc);
                // Print the full path of the matching file.
                System.out.println(doc.get("fullpath"));
            }
        }
    }

    /**
     * Runs a sample search.
     *
     * @param args optional: {@code args[0]} is the index directory and
     *             {@code args[1]} the query; missing values fall back to the
     *             built-in defaults
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = (args != null && args.length > 0) ? args[0] : "D:\\indexdir";
        String q = (args != null && args.length > 1) ? args[1] : "Rome";
        search(indexDir, q);
    }
}

需要注意的是，TopDocs对象只包含匹配文档的引用。匹配的文档并不会在搜索过程中立即加载，而是延迟加载的——只有调用IndexSearcher.doc(int docID)方法时，才会从索引中读取对应的文档。

转载于:https://www.cnblogs.com/xuyadong/p/3727075.html

weixin_30472035

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
初识lucene

Lucene是什么？Lucene是一款高性能、可扩展的信息检索工具库。信息检索是指文档搜索、文档内信息搜索或者文档相关的元数据搜索等操作。第一个lucene程序，简单建立索引文档package com.sbq.studylucene;import java.io.File;import java.io.FileFilter;import java.io.FileReader;im...
复制链接

扫一扫