参考官网:http://lucene.apache.org/core/5_3_1/demo/overview-summary.html#overview_description
package example;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
//http://lucene.apache.org/core/5_3_1/demo/overview-summary.html#overview_description
/**
 * Builds (or updates) a Lucene index from the files found under a document directory.
 *
 * <p>Each file becomes one {@link Document} with three fields: {@code path} (stored, not
 * tokenized), {@code modified} (last-modified time in milliseconds, not stored) and
 * {@code contents} (tokenized from the file text read as UTF-8, not stored).
 */
public class FileIndex {

    /**
     * Indexes every file under the document directory into the index directory.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or written
     */
    public static void main(String[] args) throws Exception
    {
        // true: wipe any existing index and start fresh; false: add to / update the existing index.
        final boolean create = false;
        final String docsPath = "E:\\LuceneDocument";
        final String indexPath = "E:\\LuceneIndex";

        final Path docDir = Paths.get(docsPath);
        if (!Files.isReadable(docDir))
        {
            System.out.println("Document directory '" +docDir.toAbsolutePath()+ "' does not exist or is not readable");
            System.exit(1);
        }

        // Directory represents where the index is stored. FSDirectory keeps the index on the
        // file system. The Analyzer tokenizes document text before it is handed to the
        // IndexWriter. Both are closed automatically, even if indexing fails.
        try (Directory directory = FSDirectory.open(Paths.get(indexPath));
             Analyzer analyzer = new StandardAnalyzer())
        {
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            // The open mode must be chosen BEFORE the IndexWriter is constructed: the writer
            // decides whether to create or append when it opens the index, so changing the
            // mode afterwards would not wipe an existing index but would still make
            // IndexDoc() take the "new index" branch and add duplicate documents.
            indexWriterConfig.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

            // IndexWriter is the core class that adds Document objects to the index.
            try (IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig))
            {
                IndexDocs(indexWriter, docDir);
            }
        }
    }

    /**
     * Indexes the given path. If it is a directory, every file below it is visited and
     * indexed; otherwise the single file is indexed.
     *
     * @param writer writer for the index the documents are added to
     * @param path   file or directory to index
     * @throws IOException if walking the tree fails, or if the single given file cannot be indexed
     */
    static void IndexDocs(final IndexWriter writer, Path path) throws IOException
    {
        if (!Files.isDirectory(path))
        {
            IndexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
            return;
        }

        Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
            {
                try
                {
                    IndexDoc(writer, file, attrs.lastModifiedTime().toMillis());
                }
                catch (IOException e)
                {
                    // Best effort: a file that cannot be indexed must not abort the whole
                    // walk, but report it so the failure is not silently lost.
                    System.err.println("skipping " + file + ": " + e);
                }
                return FileVisitResult.CONTINUE;
            }
        });
    }

    /**
     * Indexes a single file as one document.
     *
     * @param writer       writer for the index the document is added to
     * @param file         file whose path and contents are indexed
     * @param lastModified last-modified time of the file, in milliseconds
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    public static void IndexDoc(IndexWriter writer, Path file, long lastModified) throws IOException
    {
        try (InputStream stream = Files.newInputStream(file))
        {
            // A Document describes one item to index (here: one file) and is made of Fields,
            // much like a database record is made of columns.
            Document document = new Document();

            // Add the path of the file as a field named "path". Use a field that is indexed
            // (i.e. searchable), but don't tokenize the field into separate words and don't
            // index term frequency or positional information.
            Field pathField = new StringField("path", file.toString(), Field.Store.YES);
            document.add(pathField);

            // Record the last-modified time; the value is not stored in the index.
            document.add(new LongField("modified", lastModified, Field.Store.NO));

            // Add the contents of the file to a field named "contents".
            // Specify a Reader, so that the text of the file is tokenized and indexed, but not stored.
            // The reader is consumed while the document is written, i.e. before the stream is closed.
            document.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
            {
                // New index, so we just add the document (no old document can be there):
                System.out.println("adding "+file);
                writer.addDocument(document);
            }
            else
            {
                // Existing index (an old copy of this document may have been indexed) so
                // we use updateDocument instead to replace the old one matching the exact path, if present.
                // A Term names the field to look in ("path") and the exact value to match.
                System.out.println("updating "+file);
                writer.updateDocument(new Term("path", file.toString()), document);
            }
        }
    }
}
关于如何检索,等待下一篇文章。
自己补理论,可以参考:http://www.cnblogs.com/xing901022/p/3933675.html
参考文献:
[1] Mendes, Pablo N, Jakob, Max, García-Silva, Andrés, et al. DBpedia spotlight: Shedding light on the web of documents[C]// Proceedings of the 7th International Conference on Semantic Systems. ACM, 2011:1-8.
[2] Han X, Sun L. A Generative Entity-Mention Model for Linking Entities with Knowledge Base.[J]. Proceeding of Acl, 2011:945-954.
[4] http://alias-i.com/lingpipe/demos/tutorial/ne/read-me.html
[5] http://wiki.dbpedia.org/Downloads2014
[6] http://www.oschina.net/p/jieba(结巴分词)