package com.cjr.lucene;
import java.io.Closeable;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index over the files found directly inside a data directory.
 *
 * <p>Each accepted file becomes one document with three stored text fields:
 * {@code contents}, {@code filename} and {@code fullpath}. Sub-directories are
 * not descended into.
 *
 * <p>An {@code Indexer} holds an open {@link IndexWriter} and the
 * {@link Directory} it writes to; call {@link #close()} (or use
 * try-with-resources) to release them.
 *
 * @author chenjiarong
 */
public class Indexer implements Closeable {

    /** Directory in which the index is stored (used by {@link #main}). */
    private static final String INDEXDIR = "F:\\工作区\\test";

    /** Directory containing the files to index (used by {@link #main}). */
    private static final String DATADIR = "F:\\其它\\文学";

    /** File-name suffix accepted by {@link TextFilesFilter}. */
    private static final String SUFFIX = ".txt";

    /** Index storage opened by the constructor; owned and closed by this object. */
    private final Directory directory;

    /** Writer that adds documents to the index. */
    private final IndexWriter indexWriter;

    /**
     * Indexes every {@code .txt} file in {@link #DATADIR} into {@link #INDEXDIR}
     * and prints the document count reported by the writer.
     *
     * @param args ignored
     * @throws IOException if the index cannot be opened, a file cannot be read,
     *         or the index cannot be closed
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources releases the writer (and its write lock) even
        // when indexing fails part-way through.
        try (Indexer indexer = new Indexer(INDEXDIR)) {
            int numIndexed = indexer.index(DATADIR, new TextFilesFilter());
            System.out.println("numIndexed: " + numIndexed);
        }
    }

    /**
     * Opens (creating if necessary) the index in the given directory and
     * prepares a writer that appends to it.
     *
     * @param inderDir path of the directory that holds the index
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public Indexer(String inderDir) throws IOException {
        Directory dir = FSDirectory.open(new File(inderDir));
        try {
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig config =
                    new IndexWriterConfig(Version.LUCENE_4_10_2, analyzer);
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            this.indexWriter = new IndexWriter(dir, config);
        } catch (IOException | RuntimeException e) {
            // Do not leak the directory handle when the writer cannot be created.
            try {
                dir.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.directory = dir;
    }

    /**
     * Adds one document per accepted file found directly inside {@code dataDir}.
     *
     * <p>A file is indexed when it is a readable, existing, non-hidden regular
     * entry (not a directory) and {@code fileFilter} is {@code null} or accepts it.
     *
     * @param dataDir    path of the directory whose files are indexed
     * @param fileFilter extra acceptance test, or {@code null} to accept every file
     * @return the value of {@link IndexWriter#numDocs()} after indexing; since the
     *         writer is opened with {@code CREATE_OR_APPEND}, this presumably also
     *         counts documents that were already in the index
     * @throws IOException if {@code dataDir} cannot be listed, a file cannot be
     *         read, or the index cannot be written
     */
    public int index(String dataDir, FileFilter fileFilter) throws IOException {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs; report that instead of failing with an NPE.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }
        for (File file : files) {
            if (isIndexable(file) && (fileFilter == null || fileFilter.accept(file))) {
                indexFile(file);
            }
        }
        return indexWriter.numDocs();
    }

    /** Tells whether {@code file} is a visible, existing, readable non-directory. */
    private static boolean isIndexable(File file) {
        return !file.isDirectory() && !file.isHidden() && file.exists() && file.canRead();
    }

    /**
     * Prints the file's canonical path and adds its document to the index.
     *
     * @param file the file to index
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    private void indexFile(File file) throws IOException {
        System.out.println("Indexing " + file.getCanonicalPath());
        Document document = getDocument(file);
        indexWriter.addDocument(document);
    }

    /**
     * Builds the Lucene document for a file.
     *
     * @param file the file to convert
     * @return a document with stored {@code contents}, {@code filename} and
     *         {@code fullpath} fields
     * @throws IOException if the file content or canonical path cannot be read
     */
    private Document getDocument(File file) throws IOException {
        Document document = new Document();
        // Read through the throwing helper so an unreadable file surfaces as an
        // IOException instead of a null field value.
        document.add(new TextField("contents", readContent(file), Field.Store.YES));
        document.add(new TextField("filename", file.getName(), Field.Store.YES));
        document.add(new TextField("fullpath", file.getCanonicalPath(), Field.Store.YES));
        return document;
    }

    /**
     * Reads the whole file as UTF-8 text.
     *
     * <p>This is a best-effort convenience: a read failure is printed to
     * standard error and reported as {@code null} rather than thrown.
     *
     * @param file the file to read
     * @return the file content, or {@code null} if it could not be read
     */
    public String getFileContent(File file) {
        try {
            return readContent(file);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Reads the whole file into memory and decodes it as UTF-8. */
    private static String readContent(File file) throws IOException {
        return new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws IOException if either resource fails to close
     */
    @Override
    public void close() throws IOException {
        try {
            indexWriter.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Accepts files whose name ends with {@link #SUFFIX}, ignoring case.
     *
     * @author chenjiarong
     */
    private static class TextFilesFilter implements FileFilter {
        @Override
        public boolean accept(File pathname) {
            String name = pathname.getName();
            // regionMatches compares case-insensitively without depending on the
            // default locale; a name shorter than the suffix yields a negative
            // offset, for which regionMatches returns false.
            return name.regionMatches(true, name.length() - SUFFIX.length(),
                    SUFFIX, 0, SUFFIX.length());
        }
    }
}
// Lucene: creating an index
// (Footer of the source article: "latest recommended article published 2022-05-28 16:27:58")