0. 对指定目录中的文件进行索引并执行搜索
- Lucene 版本为 8.0.0,需要 JDK 8.0 及以上版本。
- 注意:这里在遍历文档目录时,没有采用递归函数实现,而是使用 Files 工具类(Files.walkFileTree),Files 工具类效率更高。而且在使用递归函数时,递归深度受虚拟机线程栈大小的限制;简单测试了一下,我的机器上最大递归深度约为 36631。另外,递归相比循环,方法调用的成本也更高。
- 下面的索引与搜索程序的重点分别在 Field 类和 Query 类的使用。
1.1 索引程序如下:
import static org.junit.Assert.*;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.BasicFileAttributes;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
/**
 * Builds a Lucene index from the files found under a fixed documents directory.
 *
 * <p>Each file becomes one document with these fields:
 * <ul>
 *   <li>{@code title} - file name (without a trailing {@code .java}), analyzed and stored</li>
 *   <li>{@code createdTime} / {@code createdTimeValue} - indexing timestamp as point and stored value</li>
 *   <li>{@code body} - file content decoded as UTF-8, analyzed but not stored</li>
 *   <li>{@code bodyValue} - raw file bytes, stored only</li>
 *   <li>{@code titleHashCode} / {@code titleHashCodeValue} - hash of the title as point and stored value</li>
 *   <li>{@code path} - file path, stored only</li>
 * </ul>
 *
 * <p>The class keeps its bookkeeping in static fields and is not designed for concurrent use.
 * Loading the class terminates the JVM if the documents directory does not exist.
 */
public class Demo3_Indexing {

    private static final String JAVA_SUFFIX = ".java";

    private static String indexPathStr = "E:\\temp\\lucene-index";
    private static Path indexPath = null;
    private static String docsPathStr = "E:\\temp\\lucene-docs\\src";
    private static Path docsPath = null;

    /** Number of files visited so far (incremented before each indexing attempt). */
    private static int count = 0;

    private static boolean titleHashCodeRangeInitialized = false;
    private static int minTitleHashCodeValue = 0;
    private static int maxTitleHashCodeValue = 0;

    private static boolean docCreatedTimeRangeInitialized = false;
    private static long minDocCreatedTimeValue = 0;
    private static long maxDocCreatedTimeValue = 0;

    static {
        docsPath = Paths.get(docsPathStr);
        if (Files.notExists(docsPath, LinkOption.NOFOLLOW_LINKS)) {
            System.out.println("指定文件目录不存在,docsPath:" + docsPathStr);
            System.out.println("程序退出");
            System.exit(1);
        }
        indexPath = Paths.get(indexPathStr);
        if (Files.notExists(indexPath, LinkOption.NOFOLLOW_LINKS)) {
            // Not fatal: the index directory is created when the index is opened for writing.
            System.out.println("(不要慌,Lucene 会为我们创建的)指定索引文件不存在,indexPath:" + indexPathStr);
        }
    }

    /** Scratch test: prints a file name with its {@code .java} suffix removed. */
    @Test
    public void test_temp() throws Exception {
        String title = "String.java";
        System.out.println(title.substring(0, title.indexOf(".java")));
    }

    /** Scratch test: prints the relative path {@code doc}. */
    @Test
    public void test_path() throws Exception {
        System.out.println(Paths.get("doc"));
    }

    /**
     * Widens the recorded title-hash range so that it contains {@code hashCode}.
     * The first value seeds both ends of the range, so ranges that lie entirely
     * below (or above) zero are reported correctly.
     */
    private static void updateMinOrMaxTitleHashCodeValue(int hashCode) {
        if (!titleHashCodeRangeInitialized) {
            titleHashCodeRangeInitialized = true;
            minTitleHashCodeValue = hashCode;
            maxTitleHashCodeValue = hashCode;
            return;
        }
        if (hashCode > maxTitleHashCodeValue) {
            maxTitleHashCodeValue = hashCode;
        }
        if (hashCode < minTitleHashCodeValue) {
            minTitleHashCodeValue = hashCode;
        }
    }

    /**
     * Widens the recorded creation-time range so that it contains {@code createdTime}.
     * The first value seeds both ends of the range.
     */
    private static void updateMinOrMaxDocCreatedTimeValue(long createdTime) {
        if (!docCreatedTimeRangeInitialized) {
            docCreatedTimeRangeInitialized = true;
            minDocCreatedTimeValue = createdTime;
            maxDocCreatedTimeValue = createdTime;
            return;
        }
        if (createdTime > maxDocCreatedTimeValue) {
            maxDocCreatedTimeValue = createdTime;
        }
        if (createdTime < minDocCreatedTimeValue) {
            minDocCreatedTimeValue = createdTime;
        }
    }

    /**
     * Returns {@code fileName} without a trailing {@code .java}; other names are returned unchanged.
     * Only the final suffix is removed, so {@code A.java.java} becomes {@code A.java}.
     */
    private static String stripJavaSuffix(String fileName) {
        if (fileName.endsWith(JAVA_SUFFIX)) {
            return fileName.substring(0, fileName.length() - JAVA_SUFFIX.length());
        }
        return fileName;
    }

    /**
     * Counts the regular files below {@code file} by plain recursion, adding to the shared counter.
     * Kept for comparison with {@link #walkPaths(IndexWriter, Path)}.
     *
     * @param file a file or directory; directories that cannot be listed are skipped
     */
    public static void recurveFiles(File file) {
        if (!file.isDirectory()) {
            count++;
            return;
        }
        File[] children = file.listFiles();
        if (children == null) {
            // listFiles() returns null on an I/O error or when the directory is not readable.
            return;
        }
        for (File child : children) {
            recurveFiles(child);
        }
    }

    /**
     * Indexes {@code path} (a single file, or every file below a directory) and then closes
     * {@code writer}. The writer is closed even when the traversal fails.
     *
     * @param writer the index writer to add documents to; closed by this method
     * @param path   the file or directory to index
     */
    public static void walkPaths(final IndexWriter writer, Path path) {
        try {
            if (Files.isDirectory(path, LinkOption.NOFOLLOW_LINKS)) {
                Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                        if (!Files.isDirectory(file, LinkOption.NOFOLLOW_LINKS)) {
                            count++;
                            doIndexing(writer, file);
                        }
                        return FileVisitResult.CONTINUE;
                    }

                    @Override
                    public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
                        // Best effort: report the entry that could not be visited and keep walking.
                        System.err.println("skip unreadable entry: " + file + " (" + exc + ")");
                        return FileVisitResult.CONTINUE;
                    }
                });
            } else {
                count++;
                doIndexing(writer, path);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Adds one document describing {@code filePath} to the index.
     *
     * <p>The file is read once; the same bytes feed both the analyzed {@code body} field and the
     * stored {@code bodyValue} field, so no stream is left open if reading fails. I/O failures are
     * printed and the file is skipped. The min/max statistics are updated only after the document
     * was added successfully.
     *
     * @param writer   the index writer to add the document to
     * @param filePath the file to index
     */
    public static void doIndexing(IndexWriter writer, Path filePath) {
        try {
            long docCreatedTime = System.currentTimeMillis();
            String title = stripJavaSuffix(filePath.getFileName().toString());
            int titleHashCode = title.hashCode();
            byte[] body = Files.readAllBytes(filePath);

            Document document = new Document();
            document.add(new TextField("title", title, Field.Store.YES));
            document.add(new LongPoint("createdTime", docCreatedTime));
            document.add(new StoredField("createdTimeValue", docCreatedTime));
            document.add(new TextField("body", new String(body, StandardCharsets.UTF_8), Field.Store.NO));
            document.add(new StoredField("bodyValue", body));
            document.add(new IntPoint("titleHashCode", titleHashCode));
            document.add(new StoredField("titleHashCodeValue", titleHashCode));
            document.add(new StoredField("path", filePath.toString()));
            writer.addDocument(document);

            updateMinOrMaxDocCreatedTimeValue(docCreatedTime);
            updateMinOrMaxTitleHashCodeValue(titleHashCode);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Rebuilds the index from scratch ({@code OpenMode.CREATE}) and prints timing and range statistics.
     *
     * @param args ignored
     * @throws IOException if the index directory or writer cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        System.out.println("beginning...");
        count = 0;
        long st = System.currentTimeMillis();
        try (Directory directory = FSDirectory.open(indexPath);
                Analyzer analyzer = new StandardAnalyzer()) {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter writer = new IndexWriter(directory, config);
            // walkPaths closes the writer when it is done.
            walkPaths(writer, docsPath);
        }
        long et = System.currentTimeMillis();
        System.out.println("indxing file cost time " + (et - st) + "ms, count is " + count);
        System.out.println("title hash code range is from " + minTitleHashCodeValue + " to " + maxTitleHashCodeValue);
        System.out.println("document created time range is from " + minDocCreatedTimeValue
                + " to " + maxDocCreatedTimeValue);
    }
}
1.2 搜索程序如下:
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.xml.builders.PointRangeQueryBuilder;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Runs a sample query against the index written by the indexing demo and prints the hits.
 *
 * <p>Loading the class terminates the JVM if the index directory does not exist.
 */
public class Demo3_Searching {

    /** Number of leading characters of the stored body shown for each hit. */
    private static final int BODY_PREVIEW_LENGTH = 10;

    private static String indexPathStr = "E:\\temp\\lucene-index";
    private static Path indexPath = null;

    static {
        indexPath = Paths.get(indexPathStr);
        if (Files.notExists(indexPath, LinkOption.NOFOLLOW_LINKS)) {
            System.out.println("指定索引文件不存在,无法执行搜索,indexPath:" + indexPathStr);
            System.out.println("程序退出");
            System.exit(1);
        }
    }

    /**
     * Decodes a whole byte array as UTF-8 text.
     *
     * @param bytes the bytes to decode; may be {@code null}
     * @return the decoded text, or an empty string for {@code null} or empty input
     */
    public static String getStringValueFromBytes(byte[] bytes) {
        if (bytes == null) {
            return "";
        }
        return getStringValueFromBytes(bytes, 0, bytes.length);
    }

    /**
     * Decodes a slice of a byte array as UTF-8 text.
     *
     * @param bytes  the backing array; may be {@code null}
     * @param offset index of the first byte of the slice
     * @param length number of bytes in the slice
     * @return the decoded text, or an empty string when there is nothing to decode
     */
    public static String getStringValueFromBytes(byte[] bytes, int offset, int length) {
        if (bytes == null || length <= 0) {
            return "";
        }
        return new String(bytes, offset, length, StandardCharsets.UTF_8);
    }

    /**
     * Returns the binary value of {@code field} decoded as UTF-8, honouring the offset and
     * length of the underlying byte slice rather than decoding the whole backing array.
     *
     * @param field the stored field; may be {@code null}
     * @return the decoded text, or an empty string when the field or its binary value is absent
     */
    private static String decodeBinaryField(Field field) {
        if (field == null || field.binaryValue() == null) {
            return "";
        }
        return getStringValueFromBytes(field.binaryValue().bytes,
                field.binaryValue().offset, field.binaryValue().length);
    }

    /**
     * Executes {@code query} and prints the elapsed time followed by score, title, title hash,
     * path and a short body preview for each hit. Hits without a stored body or title hash are
     * printed with an empty preview / {@code null} hash instead of failing. I/O failures are printed.
     *
     * @param searcher the searcher to run the query on
     * @param query    the query to execute
     * @param histNum  the maximum number of hits to retrieve
     */
    public static void doSearching(IndexSearcher searcher, Query query, int histNum) {
        try {
            long startTime = System.currentTimeMillis();
            TopDocs topDocs = searcher.search(query, histNum);
            System.out.println("搜索耗时:" + (System.currentTimeMillis() - startTime) + "毫秒");
            if (topDocs == null) {
                return;
            }
            System.out.println("搜索结果为: \n--------------------");
            for (ScoreDoc hit : topDocs.scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                String title = doc.get("title");
                String path = doc.get("path");
                // Document.get returns the string form of the stored value, or null when absent.
                String titleHashCode = doc.get("titleHashCodeValue");
                String body = decodeBinaryField((Field) doc.getField("bodyValue"));
                System.out.println("score=" + hit.score + ", title : "
                        + title + ", hashCod=" + titleHashCode
                        + ", \npath : " + path);
                System.out.println("[body]\n"
                        + body.substring(0, Math.min(BODY_PREVIEW_LENGTH, body.length())));
                System.out.println("----------");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens the index, searches the {@code body} field for the term {@code string} and prints the
     * top hits. The reader and the directory are closed when the search is finished.
     *
     * @param args ignored
     * @throws IOException    if the index cannot be opened or closed
     * @throws ParseException declared for compatibility; not thrown by the current query
     */
    public static void main(String[] args) throws IOException, ParseException {
        try (Directory directory = FSDirectory.open(indexPath);
                IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            int histNum = 10;
            Query query = new TermQuery(new Term("body", "string"));
            System.out.println("查询语句为:" + query);
            doSearching(searcher, query, histNum);
        }
    }
}