整理记录了在Lucene 4版本中遍历读取本地文件夹数据,并创建索引的过程。
大致步骤:
- 确定索引的存放目录和待索引文件的路径
- 本地生成磁盘索引,准备添加数据
- 读取目录下的文件信息,分析文本文件,并使用内存索引进行索引。之后将内存索引添加到本地磁盘索引中去。(为了提高索引和检索的性能指标)
- 对子目录进行递归遍历,直至所有文件遍历完成。
- 关闭本地磁盘索引目录,索引创建完成。
package com.gangwu.lucene.tools;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
/**
 * Utilities for building a Lucene index from the text files under a local directory.
 *
 * <p>Every eligible file is first indexed into its own in-memory {@link RAMDirectory};
 * that in-memory index is then merged into the on-disk index.
 *
 * @author root
 */
public class IndexTools {
    private static final Logger logger = Logger.getLogger(IndexTools.class);

    /** File-name suffix a file must have in order to be indexed (case-sensitive). */
    private static final String VALID_SUFFIX = ".txt";

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the index directory, {@code args[1]} the
     *             directory to index; the original placeholder paths are used when absent
     */
    public static void main(String[] args) {
        File indexPath = new File(args.length > 0 ? args[0] : "Dest_Index_Path");
        File dataPath = new File(args.length > 1 ? args[1] : "Test_File_Path");
        int nums = indexBuilder(indexPath, dataPath);
        System.out.println("doc counts is : " + nums);
    }

    /**
     * Builds the on-disk index.
     *
     * <p>Opens an {@link IndexWriter} on {@code indexPath}, recursively indexes
     * {@code dataPath}, and finally merges the index down to at most five segments.
     * I/O failures are logged rather than propagated.
     *
     * @param indexPath directory in which the index is stored
     * @param dataPath  directory whose files are to be indexed
     * @return the number of documents in the index, or 0 if {@code dataPath} is not a
     *         readable directory or indexing failed before the count was taken
     */
    public static int indexBuilder(File indexPath, File dataPath) {
        if (!dataPath.exists() || !dataPath.isDirectory() || !dataPath.canRead()) {
            // Nothing can be indexed: report it and stop instead of carrying on.
            logger.error(dataPath + " 不存在或不允许访问!");
            return 0;
        }

        int num = 0;
        IndexWriterConfig conf = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer());
        conf.setUseCompoundFile(true); // enable the compound-file format for segments

        // try-with-resources closes the writer first, then the directory, even on failure.
        try (Directory fsDir = FSDirectory.open(indexPath);
             IndexWriter fsdWriter = new IndexWriter(fsDir, conf)) {
            subIndexBuilder(fsdWriter, dataPath);
            num = fsdWriter.numDocs();
            fsdWriter.forceMerge(5); // merge down to at most 5 segments
        } catch (IOException e) {
            logger.error("Failed to build index in " + indexPath, e);
        }
        return num;
    }

    /**
     * Recursively walks a directory: descends into sub-directories and indexes every
     * file whose name passes {@link #isValidType(String)}.
     *
     * @param fsdWriter writer of the on-disk index
     * @param subPath   directory to walk
     */
    private static void subIndexBuilder(IndexWriter fsdWriter, File subPath) {
        File[] fileList = subPath.listFiles();
        if (fileList == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            logger.warn("Cannot list directory, skipping: " + subPath);
            return;
        }
        for (File file : fileList) {
            if (file.isDirectory()) {
                subIndexBuilder(fsdWriter, file);
            } else if (isValidType(file.getName())) {
                fileIndexBuilder(fsdWriter, file);
            }
        }
    }

    /**
     * Indexes a single file into a fresh in-memory index and merges that index into the
     * on-disk index. Hidden, missing or unreadable files are skipped silently; I/O
     * failures are logged and do not abort the overall run.
     *
     * @param fsdWriter writer of the on-disk index
     * @param subFile   file to index
     */
    private static void fileIndexBuilder(IndexWriter fsdWriter, File subFile) {
        if (subFile.isHidden() || !subFile.exists() || !subFile.canRead()) {
            return;
        }

        IndexWriterConfig conf = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer());
        conf.setUseCompoundFile(true); // enable the compound-file format for segments

        try (Directory ramDir = new RAMDirectory()) {
            // NOTE: FileReader decodes with the platform default charset.
            try (FileReader fileReader = new FileReader(subFile);
                 IndexWriter ramWriter = new IndexWriter(ramDir, conf)) {
                System.out.println("-> 创建索引 : " + subFile.getCanonicalPath());

                Document document = new Document();
                document.add(new TextField("name", subFile.getName(), Store.YES));
                document.add(new TextField("path", subFile.getAbsolutePath(), Store.YES));
                document.add(new TextField("content", fileReader));
                ramWriter.addDocument(document); // reader is consumed here
            }
            // The in-memory writer must be closed (committed) before its directory is merged.
            fsdWriter.addIndexes(new Directory[] {ramDir});
        } catch (IOException e) {
            logger.error("Failed to index file " + subFile, e);
        }
    }

    /**
     * Tells whether a file name has the suffix required for indexing.
     *
     * @param name file name
     * @return {@code true} if the name ends with {@code .txt}
     */
    private static boolean isValidType(String name) {
        return name.endsWith(VALID_SUFFIX);
    }
}